376 lines
11 KiB
Perl
Executable File
376 lines
11 KiB
Perl
Executable File
#!/usr/local/bin/perl
|
|
# Used with permission by Bunyip Information Systems
|
|
#
|
|
# This program filters raw ls listings files to produce a file that archie
|
|
# can parse. It started life as the filt program in the original archie
|
|
# distribution and was converted to perl and modified by Amos Shapira.
|
|
# Then significant modifications and improvements, as well as many hours of
|
|
# testing, were done by Eric Anderson at SURAnet (eanders@sura.net).
|
|
#
|
|
# Please report any bugs or send any patches to archie-admin@sura.net
|
|
# Also regular expression gurus should feel free to comment what various
|
|
# undocumented sections of the code actually does.
|
|
#
|
|
# Notes:
|
|
# Need to add to fixpermerrors a check for whatever stupid thing the site
|
|
# is sticking onto the front.
|
|
# SITES we do not handle yet:
|
|
# eba.eb.ele.tue.nl
|
|
# harvard.harvard.edu
|
|
# biox.unibas.ch
|
|
# garbo.uwasa.fi
|
|
# inria.inria.fr
|
|
# thumper.bellcore.com
|
|
# Filter version 1.0.4
|
|
$debuglevel=0;
|
|
# Debuglevels:
|
|
$\ = "\n"; # automatically add newline on print
|
|
@legit = ("[dl-]","[r-]","[w-]","[xsS-]","[r-]","[w-]",
|
|
"[xsS-]","[r-]","[w-]","[x-]",
|
|
"(\\d| )","(\\d| )","(\\d| )","[ \\d\\w]","[ \\d\\w]");
|
|
#for ($loop=
|
|
# Legit patterns for chars in permission part of line.
|
|
|
|
$fulllegit = join("",@legit);
|
|
$aixlegit = join("","^[DF]",@legit[1..9]);
|
|
$dirperms = join("",'d',@legit[1..9]);
|
|
$permlegit = join("","^",@legit[0..9]);
|
|
$idlegit = "^(\\w|\\d|-)+$";
|
|
#print "'$dirperms'";
|
|
$badchar = "[^ -~\\t]";
|
|
$corruptpattern = join('',$badchar,".*",$badchar,".*",$badchar,".*");
|
|
$filenamebegin= -1; # Automagically initialized on the first directory.
|
|
$printedaline=0;
|
|
$maxdiraddsize=40;
|
|
$curdir="";
|
|
undef @basedirs;
|
|
$lastline="@@@beginning";
|
|
|
|
for (;<>;) {
|
|
chop;
|
|
# Commented out study because it was making some stuff break, I don't
|
|
# quite know why, below is code which breaks if it is below the study.
|
|
# print "$_";
|
|
# print "MOOOOF" if /$dirperms/o;
|
|
# next;
|
|
# study;
|
|
|
|
die ("@@@ Belch -- Corrupted input?\n") if /$corruptpattern/o;
|
|
|
|
# If any of these cases are true, the line is not printed.
|
|
|
|
# Remove totaling lines
|
|
next if (/^total[ \t]+\d+[ \t]*$/o);
|
|
next if (/^Total:[ \t]+\d+[ \t]*kbytes$/o);
|
|
# Something with opendir in it.
|
|
next if (/^opendir:/o);# {
|
|
# Toss we are in europe lines
|
|
next if (/WE ARE IN EUROPE/o);
|
|
# Don't print character or block devices.
|
|
next if (/[cb][-rwxSsTt]{9}/o);
|
|
# Chuck lines having that pattern in them.
|
|
next if (/can not access/o);
|
|
# Chuck lines as seen below.
|
|
next if (/stale nfs file handle/io);
|
|
# Another bizarre case
|
|
next if (/^\.:?$/o);
|
|
# Throw away leading blank lines
|
|
next if (/^$/ && !$printedaline);
|
|
# I wonder what this does.
|
|
next if (/^[ \t]/o);
|
|
# Throw away lines containing unreadable.
|
|
next if (/unreadable/o);
|
|
# Throw away lines which have : no /dev/zero at the end of them
|
|
next if (/: no \/dev\/zero$/o);
|
|
# Throw away lines which have No such file or directory in them
|
|
next if (/No such file or directory$/o);
|
|
# Throw away lines with crt0: no /usr/lib/ld.so -- for sparta.spartacus.com
|
|
next if (/^crt0: no \/usr\/lib\/ld.so$/o);
|
|
# Throw away short lines which aren't blank and aren't directories.
|
|
# First seen on cs.tut.fi
|
|
next if ((length($_)<10)&&(!/:$/o)&&(!/^$/o)&&!(/^\.|\//o));
|
|
next if ((length($_)<$filenamebegin) && /^$fulllegit/o &&
|
|
!(/Permission denied/o||/not found/o||/cannot access/o));
|
|
# Throw away ld.so warnings
|
|
next if (/^ld.so: warning: /o);
|
|
# Throw away more ld.so errors
|
|
next if (/^ld.so: map heap error \(22\) for \/dev\/zero/o);
|
|
# Throw away the line ${org}: for eba.eb.ele.tue.nl
|
|
next if (/^\$\{org\}\:$/o);
|
|
# Throw away lines with connection timed out for ftp.informatik.rwth-aachen.de
|
|
next if (/^ls:.*Connection timed out$/o);
|
|
# Throw away this line for aix370.rrz.uni-koeln.de
|
|
next if (/^\.disk1\:$/o);
|
|
# Remove blank lines which precede filename entries so that enter doesn't
|
|
# think they are supposed to be directory names.
|
|
if (/^$/o) {
|
|
$_ = <STDIN>;
|
|
# print STDERR "$_"; # ***
|
|
last if !defined $_;
|
|
chop;
|
|
if (!/^$fulllegit/o) {
|
|
print "";
|
|
$lastline = "";
|
|
$_ .= " "; # For the chop to eat.
|
|
redo;
|
|
}
|
|
}
|
|
|
|
if ($filenamebegin<0&&/^$dirperms/o) {
|
|
$filenamebegin = length($_);
|
|
do {
|
|
--$filenamebegin;
|
|
die ("filenamebegin dropped too much??") if $filenamebegin<20;
|
|
} until ((substr($_,$filenamebegin,1) eq " ")&&
|
|
(substr($_,$filenamebegin-2,1) =~ /\d/o));
|
|
++$filenamebegin;
|
|
}
|
|
# Make sure we don't get stuck in a loop.
|
|
$count=0;
|
|
$start=$_;
|
|
# This forces idempotency. the loop that is.
|
|
do {
|
|
++$count;
|
|
die("@@@ iterated for a long time on \n'$start'\n, never got done.\n") if $count>50;
|
|
# warn("@@@ iterations:$count") if $count>2;
|
|
$orig = $_;
|
|
# Try to put : after dir names in listing.
|
|
# bin:
|
|
# files
|
|
|
|
if ((/^\./o || /^\//o) && /\w$/o && !/Permission denied/o) {
|
|
print "";
|
|
$lastline = "";
|
|
$_ = "$_:";
|
|
}
|
|
# Dump a return in if the last line was all printable chars with a colon
|
|
# on the end, e.g. a directory, and the last line was for real.
|
|
if (!/^$fulllegit/o) {
|
|
print "" if (/^[\w\/+#-\.]+:$/o && $lastline);
|
|
# Remove an extra color from a directory name entry if it exists.
|
|
# For wuarchive.wustl.edu
|
|
s/::$/:/o;
|
|
}
|
|
|
|
#General cleanup
|
|
# Hack out garbage people put on front of listings.
|
|
if (/^\//o || /^\./o) {
|
|
s!^\./!!o;
|
|
s!^/usr/spool/ftp/!!o;
|
|
s!^/pub/!!o;
|
|
s!^/usr/local/pub/!!o;
|
|
s!^/home/ftp/pub/!!o;
|
|
s!^/ftp/pub/!!o;
|
|
s!^/com/ftp/pub/!!o;
|
|
s!^/var/spool/uucppublic/!!o;
|
|
s!^/com/ftp/sun4/pub/!!o;
|
|
s!^/users/ftp/!!o;
|
|
s!^.disk1/!!o; # For aix370.rrz.uni-koeln.de
|
|
}
|
|
|
|
# What's this do?
|
|
# s/^([-dl][-rwxSsTt]{9}.*)(\\$)/$1/o;
|
|
# s/^([-dl][-rwxSsTt]{9})(\d+)/$1 $2/o;
|
|
# Take out trailing / from directory listing
|
|
s/^(d.*)\/$/$1/o;
|
|
# Take :'s off the end of lines which aren't really directories names.
|
|
# Why would I want to do this? Note I still do.
|
|
# Do this so that the next line will work right.
|
|
s/^($fulllegit.*)\:$/$1/o;
|
|
# Hack for walhalla.informatik.uni-dortmund.de, user/group names with
|
|
# spaces in them.
|
|
s/NOT FTP/NOT_FTP/go;
|
|
# Hack for gargoyle.uchicago.edu, to fix directory with return in the name
|
|
if (/^pub\/emwq\/Mailboxes.*h$/o) {
|
|
$_ .= ":";
|
|
$foo = <STDIN>;
|
|
}
|
|
# Two hacks for eba.eb.ele.tue.nl
|
|
if (/^l.*local.$/o) {
|
|
s/^l(.*local)./d$1/o;
|
|
}
|
|
if (/^pub\/apollo\/local\/News\:$/o) {
|
|
print "pub/apollo/local:";
|
|
print "drwxrwxrwx 1 news 15 Mar 15 12:00 News";
|
|
print "";
|
|
}
|
|
|
|
#Put space between permissions and id.
|
|
if (/^[ld-][r-][w-][x-]/o && /^..........\d/o) {
|
|
s/^(..........)/$1 /o;
|
|
if (substr($_,$filenamebegin-1,2) eq ' ') {
|
|
substr($_,$filenamebegin-1,2) = ' ';
|
|
}
|
|
}
|
|
|
|
#Fix AIX bogosity
|
|
if (/$aixlegit/o) {
|
|
# Aix ls follows directory symlinks.
|
|
s/ \-\> .*$//o if (/^D/o && / \-\> /o);
|
|
s/^D/d/o;
|
|
s/^F/-/o;
|
|
}
|
|
#Special hack for earth.rs.itd.umich.edu
|
|
if (/^mac\.bin\/\.AppleDesktop\/_:$/o && !$hack'umich_edu) {
|
|
# print STDERR "@Did hack for earth on $_."; # ***
|
|
$hack'umich_edu=1;
|
|
while(!/^$/o) {
|
|
$_=<STDIN>;
|
|
}
|
|
}
|
|
|
|
#Replace trailing spaces with underscores in directory listings.
|
|
$spacepos=rindex($_," ");
|
|
while (($spacepos>=$filenamebegin)
|
|
&&((/ _*$/o)||
|
|
(substr($_,$filenamebegin-6) =~ /^(\d| )\d(\d|\:)\d\d _* /o))) {
|
|
# Roughly that regexp is time (13:45) or year ( 1990)
|
|
s/ (_*)$/_$1/o;
|
|
substr($_,$filenamebegin-6) =~
|
|
s/^(..... )(_*) /$1$2_/o;
|
|
$spacepos=rindex($_," ");
|
|
}
|
|
#Put in leading spaces for bogus stuff.
|
|
#Ditto for the : terminated stuff.
|
|
# Also fixup anthing like foo/bar /doobie:
|
|
if (/:$/o) {
|
|
s/ (_*(:$|\/))/_$1/o while (/ _*(:$|\/)/o);
|
|
s/\/(_*) /\/_$1/o while (/\/_* /o);
|
|
}
|
|
|
|
#Complicated fixups.
|
|
if (/Permission denied/o||/not found/o||/cannot access/o||
|
|
/Connection timed out/o) {
|
|
$_ = &fixlserrors($_);
|
|
next if (! $_);
|
|
}
|
|
} until ($orig eq $_);
|
|
|
|
if (/^$dirperms/o) { # && length($_)<$maxdiraddsize) {
|
|
$dirname=$curdir . substr($_,$filenamebegin);
|
|
if (length($dirname) <$maxdiraddsize) {
|
|
push(@basedirs,$dirname);
|
|
# print STDERR "Adding '",$dirname,"' to dir list";
|
|
}
|
|
}
|
|
$curdir = substr($_,0,length($_)-1)."/" if (/:$/o);
|
|
|
|
$lastline = $_;
|
|
print;
|
|
$printedaline=1;
|
|
}
|
|
|
|
sub fixlserrors {
|
|
local ($_) = @_;
|
|
local ($first);
|
|
local ($count);
|
|
|
|
# print STDERR "Enter FixLsErrors";
|
|
# return "" if /^ls.*denied$/o;
|
|
return "" if /^(\/bin\/)?ls.*denied$/o;
|
|
# return "" if /^ls.*not found$/o;
|
|
return "" if /^(\/bin\/)?ls.*not found$/o;
|
|
return "" if /^cannot access /o;
|
|
return "" if /^lost\+found: Permission denied$/o;
|
|
# $foodebug=1 if $_ =~ /tesol: Per/;
|
|
$first = &fixpermline($_);
|
|
# print STDERR "*1$first" if $foodebug;
|
|
$count = 0;
|
|
$_ = undef;
|
|
while (!$_) {
|
|
# print STDERR "Hi";
|
|
++$count;
|
|
die ("@@@ FixLsErrors iterated too long :$count\n") if $count>200;
|
|
$_ = <STDIN>;
|
|
last if !defined($_);
|
|
# print STDERR "*a$count,$first,$_"; #***
|
|
# print STDERR "'$_'" if defined $_;
|
|
chop ;
|
|
last if /^$/o;
|
|
# print STDERR "*b$count,$first,$_"; #***
|
|
if (/denied$/o) {
|
|
$_ = "";
|
|
} else {
|
|
$_ = &fixpermline($_);
|
|
}
|
|
# print STDERR "*c$count,$first,$_"; #***
|
|
}
|
|
# print STDERR "*d$first" if $foodebug;
|
|
# print STDERR "@*$first,$_" if /pleD/o;
|
|
# print STDERR "*e$first" if $foodebug;
|
|
$_= $first . $_;
|
|
# print STDERR "*&&$_" if $foodebug;
|
|
return $_ if /^$fulllegit/o;
|
|
return $_ if /^.*:$/o;
|
|
return "";
|
|
}
|
|
|
|
sub fixpermline {
|
|
local ($_) = @_;
|
|
local ($count);
|
|
|
|
# print "@$_@";
|
|
# sys13 stuff for potemkin.cs.pdx.edu
|
|
s/(\/bin\/)?ls\s*:.*denied( \(sys13\))?$//o;
|
|
s/(\/bin\/)?ls\s*:.*not found$//o;
|
|
s/\.\/.*not found$//o;
|
|
s/cannot access .*$//o;
|
|
s/\.\/.*Connection timed out:$//o;
|
|
# print STDERR "*$_*";
|
|
return $_ if !(/Permission denied/o || /not found/o || /cannot access/o ||
|
|
/Connection timed out/o);
|
|
|
|
$rightmost=0;
|
|
$longlen=0;
|
|
foreach $elem (@basedirs) {
|
|
# if ($foodebug&&rindex($_,$elem)>=0) {
|
|
# print STDERR "@$elem";
|
|
# print STDERR "@$elem,", rindex($_,$elem);
|
|
# print STDERR "@", rindex($_,"mac/incoming");
|
|
# }
|
|
if (0<=($foo=rindex($_,$elem))) {
|
|
$bar = length($elem);
|
|
# print STDERR "@@$foo,$rightmost,$bar,$longlen,", $rightmost-$bar-1;
|
|
if ($foo>=($rightmost-$bar-1)) {
|
|
# Backup by at most by the length of the current one.
|
|
# Plus a /
|
|
$longlen=$bar;
|
|
$rightmost=$foo;
|
|
# print STDERR "!$_,$elem,$longlen,$rightmost" if /incoming\/pal/;
|
|
}
|
|
}
|
|
}
|
|
# if ($rightmost>0&&$foodebug) {
|
|
# print STDERR "Found:", substr($_,$rightmost);
|
|
# print STDERR "Returning:" , substr($_,0,$rightmost);
|
|
# }
|
|
if (/$fulllegit/o) {
|
|
$_ = substr($_,0,$rightmost);
|
|
} else {
|
|
return substr($_,0,$rightmost) if (($rightmost>0) &&
|
|
!(substr($_,$rightmost-1,1) eq "/"));
|
|
# print STDERR "nope";
|
|
}
|
|
|
|
local($acc,$orig) = ("",$_);
|
|
local(@line) = split(//o,$_);
|
|
local($x,$m);
|
|
local(@legitcopy) = @legit;
|
|
|
|
$count=0;
|
|
do {
|
|
++$count;
|
|
die ("@@@ fixpermline iterated too long:$count\n") if $count>50;
|
|
$_ = shift @line;
|
|
$m = shift @legitcopy;
|
|
# print STDERR "#$m#$_#$acc";
|
|
return $acc if !/$m/; # Don't put the o here, this changes.
|
|
# print STDERR "##$acc";
|
|
$acc .= $_;
|
|
} until $#legitcopy==-1;
|
|
$_=$orig;
|
|
s/\:\s*Permission denied//o;
|
|
s/\/(\w|\/)*\s*not found//o;
|
|
return $_;
|
|
}
|