archie/release/B/scripts/filter_anonftp_unix_bsd.nowork
2024-05-28 16:48:29 +02:00

376 lines
11 KiB
Perl
Executable File

#!/usr/local/bin/perl
# Used with permission by Bunyip Information Systems
#
# This program filters raw ls listings files to produce a file that archie
# can parse. It started life as the filt program in the original archie
# distribution and was converted to perl and modified by Amos Shapira.
# Then significant modifications and improvements, as well as many hours of
# testing, were done by Eric Anderson at SURAnet (eanders@sura.net).
#
# Please report any bugs or send any patches to archie-admin@sura.net
# Also regular expression gurus should feel free to comment what various
# undocumented sections of the code actually does.
#
# Notes:
# Need to add to fixpermerrors a check for whatever stupid thing the site
# is sticking onto the front.
# SITES we do not handle yet:
# eba.eb.ele.tue.nl
# harvard.harvard.edu
# biox.unibas.ch
# garbo.uwasa.fi
# inria.inria.fr
# thumper.bellcore.com
# Filter version 1.0.4
$debuglevel=0;
# Debuglevels:
$\ = "\n"; # automatically add newline on print
@legit = ("[dl-]","[r-]","[w-]","[xsS-]","[r-]","[w-]",
"[xsS-]","[r-]","[w-]","[x-]",
"(\\d| )","(\\d| )","(\\d| )","[ \\d\\w]","[ \\d\\w]");
#for ($loop=
# Legit patterns for chars in permission part of line.
$fulllegit = join("",@legit);
$aixlegit = join("","^[DF]",@legit[1..9]);
$dirperms = join("",'d',@legit[1..9]);
$permlegit = join("","^",@legit[0..9]);
$idlegit = "^(\\w|\\d|-)+$";
#print "'$dirperms'";
$badchar = "[^ -~\\t]";
$corruptpattern = join('',$badchar,".*",$badchar,".*",$badchar,".*");
$filenamebegin= -1; # Automagically initialized on the first directory.
$printedaline=0;
$maxdiraddsize=40;
$curdir="";
undef @basedirs;
$lastline="@@@beginning";
for (;<>;) {
chop;
# Commented out study because it was making some stuff break, I don't
# quite know why, below is code which breaks if it is below the study.
# print "$_";
# print "MOOOOF" if /$dirperms/o;
# next;
# study;
die ("@@@ Belch -- Corrupted input?\n") if /$corruptpattern/o;
# If any of these cases are true, the line is not printed.
# Remove totaling lines
next if (/^total[ \t]+\d+[ \t]*$/o);
next if (/^Total:[ \t]+\d+[ \t]*kbytes$/o);
# Something with opendir in it.
next if (/^opendir:/o);# {
# Toss we are in europe lines
next if (/WE ARE IN EUROPE/o);
# Don't print character or block devices.
next if (/[cb][-rwxSsTt]{9}/o);
# Chuck lines having that pattern in them.
next if (/can not access/o);
# Chuck lines as seen below.
next if (/stale nfs file handle/io);
# Another bizarre case
next if (/^\.:?$/o);
# Throw away leading blank lines
next if (/^$/ && !$printedaline);
# I wonder what this does.
next if (/^[ \t]/o);
# Throw away lines containing unreadable.
next if (/unreadable/o);
# Throw away lines which have : no /dev/zero at the end of them
next if (/: no \/dev\/zero$/o);
# Throw away lines which have No such file or directory in them
next if (/No such file or directory$/o);
# Throw away lines with crt0: no /usr/lib/ld.so -- for sparta.spartacus.com
next if (/^crt0: no \/usr\/lib\/ld.so$/o);
# Throw away short lines which aren't blank and aren't directories.
# First seen on cs.tut.fi
next if ((length($_)<10)&&(!/:$/o)&&(!/^$/o)&&!(/^\.|\//o));
next if ((length($_)<$filenamebegin) && /^$fulllegit/o &&
!(/Permission denied/o||/not found/o||/cannot access/o));
# Throw away ld.so warnings
next if (/^ld.so: warning: /o);
# Throw away more ld.so errors
next if (/^ld.so: map heap error \(22\) for \/dev\/zero/o);
# Throw away the line ${org}: for eba.eb.ele.tue.nl
next if (/^\$\{org\}\:$/o);
# Throw away lines with connection timed out for ftp.informatik.rwth-aachen.de
next if (/^ls:.*Connection timed out$/o);
# Throw away this line for aix370.rrz.uni-koeln.de
next if (/^\.disk1\:$/o);
# Remove blank lines which precede filename entries so that enter doesn't
# think they are supposed to be directory names.
if (/^$/o) {
$_ = <STDIN>;
# print STDERR "$_"; # ***
last if !defined $_;
chop;
if (!/^$fulllegit/o) {
print "";
$lastline = "";
$_ .= " "; # For the chop to eat.
redo;
}
}
if ($filenamebegin<0&&/^$dirperms/o) {
$filenamebegin = length($_);
do {
--$filenamebegin;
die ("filenamebegin dropped too much??") if $filenamebegin<20;
} until ((substr($_,$filenamebegin,1) eq " ")&&
(substr($_,$filenamebegin-2,1) =~ /\d/o));
++$filenamebegin;
}
# Make sure we don't get stuck in a loop.
$count=0;
$start=$_;
# This forces idempotency. the loop that is.
do {
++$count;
die("@@@ iterated for a long time on \n'$start'\n, never got done.\n") if $count>50;
# warn("@@@ iterations:$count") if $count>2;
$orig = $_;
# Try to put : after dir names in listing.
# bin:
# files
if ((/^\./o || /^\//o) && /\w$/o && !/Permission denied/o) {
print "";
$lastline = "";
$_ = "$_:";
}
# Dump a return in if the last line was all printable chars with a colon
# on the end, e.g. a directory, and the last line was for real.
if (!/^$fulllegit/o) {
print "" if (/^[\w\/+#-\.]+:$/o && $lastline);
# Remove an extra color from a directory name entry if it exists.
# For wuarchive.wustl.edu
s/::$/:/o;
}
#General cleanup
# Hack out garbage people put on front of listings.
if (/^\//o || /^\./o) {
s!^\./!!o;
s!^/usr/spool/ftp/!!o;
s!^/pub/!!o;
s!^/usr/local/pub/!!o;
s!^/home/ftp/pub/!!o;
s!^/ftp/pub/!!o;
s!^/com/ftp/pub/!!o;
s!^/var/spool/uucppublic/!!o;
s!^/com/ftp/sun4/pub/!!o;
s!^/users/ftp/!!o;
s!^.disk1/!!o; # For aix370.rrz.uni-koeln.de
}
# What's this do?
# s/^([-dl][-rwxSsTt]{9}.*)(\\$)/$1/o;
# s/^([-dl][-rwxSsTt]{9})(\d+)/$1 $2/o;
# Take out trailing / from directory listing
s/^(d.*)\/$/$1/o;
# Take :'s off the end of lines which aren't really directories names.
# Why would I want to do this? Note I still do.
# Do this so that the next line will work right.
s/^($fulllegit.*)\:$/$1/o;
# Hack for walhalla.informatik.uni-dortmund.de, user/group names with
# spaces in them.
s/NOT FTP/NOT_FTP/go;
# Hack for gargoyle.uchicago.edu, to fix directory with return in the name
if (/^pub\/emwq\/Mailboxes.*h$/o) {
$_ .= ":";
$foo = <STDIN>;
}
# Two hacks for eba.eb.ele.tue.nl
if (/^l.*local.$/o) {
s/^l(.*local)./d$1/o;
}
if (/^pub\/apollo\/local\/News\:$/o) {
print "pub/apollo/local:";
print "drwxrwxrwx 1 news 15 Mar 15 12:00 News";
print "";
}
#Put space between permissions and id.
if (/^[ld-][r-][w-][x-]/o && /^..........\d/o) {
s/^(..........)/$1 /o;
if (substr($_,$filenamebegin-1,2) eq ' ') {
substr($_,$filenamebegin-1,2) = ' ';
}
}
#Fix AIX bogosity
if (/$aixlegit/o) {
# Aix ls follows directory symlinks.
s/ \-\> .*$//o if (/^D/o && / \-\> /o);
s/^D/d/o;
s/^F/-/o;
}
#Special hack for earth.rs.itd.umich.edu
if (/^mac\.bin\/\.AppleDesktop\/_:$/o && !$hack'umich_edu) {
# print STDERR "@Did hack for earth on $_."; # ***
$hack'umich_edu=1;
while(!/^$/o) {
$_=<STDIN>;
}
}
#Replace trailing spaces with underscores in directory listings.
$spacepos=rindex($_," ");
while (($spacepos>=$filenamebegin)
&&((/ _*$/o)||
(substr($_,$filenamebegin-6) =~ /^(\d| )\d(\d|\:)\d\d _* /o))) {
# Roughly that regexp is time (13:45) or year ( 1990)
s/ (_*)$/_$1/o;
substr($_,$filenamebegin-6) =~
s/^(..... )(_*) /$1$2_/o;
$spacepos=rindex($_," ");
}
#Put in leading spaces for bogus stuff.
#Ditto for the : terminated stuff.
# Also fixup anthing like foo/bar /doobie:
if (/:$/o) {
s/ (_*(:$|\/))/_$1/o while (/ _*(:$|\/)/o);
s/\/(_*) /\/_$1/o while (/\/_* /o);
}
#Complicated fixups.
if (/Permission denied/o||/not found/o||/cannot access/o||
/Connection timed out/o) {
$_ = &fixlserrors($_);
next if (! $_);
}
} until ($orig eq $_);
if (/^$dirperms/o) { # && length($_)<$maxdiraddsize) {
$dirname=$curdir . substr($_,$filenamebegin);
if (length($dirname) <$maxdiraddsize) {
push(@basedirs,$dirname);
# print STDERR "Adding '",$dirname,"' to dir list";
}
}
$curdir = substr($_,0,length($_)-1)."/" if (/:$/o);
$lastline = $_;
print;
$printedaline=1;
}
sub fixlserrors {
local ($_) = @_;
local ($first);
local ($count);
# print STDERR "Enter FixLsErrors";
# return "" if /^ls.*denied$/o;
return "" if /^(\/bin\/)?ls.*denied$/o;
# return "" if /^ls.*not found$/o;
return "" if /^(\/bin\/)?ls.*not found$/o;
return "" if /^cannot access /o;
return "" if /^lost\+found: Permission denied$/o;
# $foodebug=1 if $_ =~ /tesol: Per/;
$first = &fixpermline($_);
# print STDERR "*1$first" if $foodebug;
$count = 0;
$_ = undef;
while (!$_) {
# print STDERR "Hi";
++$count;
die ("@@@ FixLsErrors iterated too long :$count\n") if $count>200;
$_ = <STDIN>;
last if !defined($_);
# print STDERR "*a$count,$first,$_"; #***
# print STDERR "'$_'" if defined $_;
chop ;
last if /^$/o;
# print STDERR "*b$count,$first,$_"; #***
if (/denied$/o) {
$_ = "";
} else {
$_ = &fixpermline($_);
}
# print STDERR "*c$count,$first,$_"; #***
}
# print STDERR "*d$first" if $foodebug;
# print STDERR "@*$first,$_" if /pleD/o;
# print STDERR "*e$first" if $foodebug;
$_= $first . $_;
# print STDERR "*&&$_" if $foodebug;
return $_ if /^$fulllegit/o;
return $_ if /^.*:$/o;
return "";
}
sub fixpermline {
local ($_) = @_;
local ($count);
# print "@$_@";
# sys13 stuff for potemkin.cs.pdx.edu
s/(\/bin\/)?ls\s*:.*denied( \(sys13\))?$//o;
s/(\/bin\/)?ls\s*:.*not found$//o;
s/\.\/.*not found$//o;
s/cannot access .*$//o;
s/\.\/.*Connection timed out:$//o;
# print STDERR "*$_*";
return $_ if !(/Permission denied/o || /not found/o || /cannot access/o ||
/Connection timed out/o);
$rightmost=0;
$longlen=0;
foreach $elem (@basedirs) {
# if ($foodebug&&rindex($_,$elem)>=0) {
# print STDERR "@$elem";
# print STDERR "@$elem,", rindex($_,$elem);
# print STDERR "@", rindex($_,"mac/incoming");
# }
if (0<=($foo=rindex($_,$elem))) {
$bar = length($elem);
# print STDERR "@@$foo,$rightmost,$bar,$longlen,", $rightmost-$bar-1;
if ($foo>=($rightmost-$bar-1)) {
# Backup by at most by the length of the current one.
# Plus a /
$longlen=$bar;
$rightmost=$foo;
# print STDERR "!$_,$elem,$longlen,$rightmost" if /incoming\/pal/;
}
}
}
# if ($rightmost>0&&$foodebug) {
# print STDERR "Found:", substr($_,$rightmost);
# print STDERR "Returning:" , substr($_,0,$rightmost);
# }
if (/$fulllegit/o) {
$_ = substr($_,0,$rightmost);
} else {
return substr($_,0,$rightmost) if (($rightmost>0) &&
!(substr($_,$rightmost-1,1) eq "/"));
# print STDERR "nope";
}
local($acc,$orig) = ("",$_);
local(@line) = split(//o,$_);
local($x,$m);
local(@legitcopy) = @legit;
$count=0;
do {
++$count;
die ("@@@ fixpermline iterated too long:$count\n") if $count>50;
$_ = shift @line;
$m = shift @legitcopy;
# print STDERR "#$m#$_#$acc";
return $acc if !/$m/; # Don't put the o here, this changes.
# print STDERR "##$acc";
$acc .= $_;
} until $#legitcopy==-1;
$_=$orig;
s/\:\s*Permission denied//o;
s/\/(\w|\/)*\s*not found//o;
return $_;
}