Here is a program I use on our local site to verify links. It is a modified
version of Merlyn's code from his "Programming Perl" column in Web Techniques
magazine. It uses the LWP package to build a sort of mini web spider to scan a
site. Further comments at the bottom. (This is a private utility, so there are
a fair number of shortcuts and assumptions made.)

#!/usr/bin/perl

use LWP::UserAgent;             # Web client (user agent)
use HTML::Parser;               # HTML parsing
use URI::URL;                   # URL parsing
use LWP::Simple qw(get);        # Basic file get
require WWW::RobotRules;        # Exclusion stuff

$opt_d = undef;                 # No debug prints
$sep   = "\001";                # separator char
$opt_z = 5;                     # Default delay
$opt_v = undef;                 # Set by -v: don't verify offsite links

# Modify @ACCEPT to contain the nodes you want to spider around on
@ACCEPT = qw(farside.gsfc.nasa.gov  webserv.gsfc.nasa.gov
             sdcd.gsfc.nasa.gov     esdcd.gsfc.nasa.gov
             nccsinfo.gsfc.nasa.gov outside.gsfc.nasa.gov);  # Nodes we will look at
$nodes_cleared = 0;

#
# hverify {-v} {-z seconds} {-a 'prefix prefix prefix ...'}
#         {-r 'prefix prefix prefix ...'} URL {URL URL ...}
#
# -v:  skip verification of offsite (leaf) links
# -z:  number of seconds to pause before proceeding
# -a:  accept these nodename prefixes (e.g., outside.gsfc, mysite[0-9], etc.)
# -r:  reject these nodename prefixes (ditto)
# URL: scan these URLs

ARGSCAN:
while (@ARGV) {
    $next = shift @ARGV;
    if ($next eq "-v") {                        # skip offsite verification
        $opt_v++;
    }
    elsif ($next eq "-z") {                     # delay
        $_ = shift @ARGV;                       # get the argument (only one)
        die "No interval for -z" unless defined $_;
        $opt_z = ($_+0 > 0 ? $_+0 : 5);         # save it
    }
    elsif ($next eq "-a") {                     # accept these prefixes
        unless ($nodes_cleared) {
            @ACCEPT = ();
            $nodes_cleared++;
        }
        until (($next = shift @ARGV) =~ /^-/) {
            last ARGSCAN unless $next;          # drop out if @ARGV empty
            push @ACCEPT, $next;
        }
        unshift @ARGV, $next;                   # put next switch back
        next ARGSCAN;                           # and restart loop
    }
    elsif ($next eq "-r") {                     # reject these prefixes
        until (($next = shift @ARGV) =~ /^-/) {
            last ARGSCAN unless $next;          # drop out if @ARGV empty
            push @REJECT, $next;
        }
        unshift @ARGV, $next;                   # put next switch back
        next ARGSCAN;                           # and restart loop
    }
    else {                                      # bare word - assume it's a URL
        push @CHECK, $next;
    }
}

foreach (@CHECK) {
    $_ .= "$sep$_";                             # no referring URL
}

##############################################################################
# PARSE keeps the search confined to the local nodes by default.
# This needs to be refined if we want to allow multi-node verifications.
sub PARSE {                     # verify existence, parse for further URLs
                                # $_[0] is the absolute URL
    my ($n);
    foreach $n (@REJECT) {      # for each leaf trim-point:
        if ($_[0] =~ m!^http://$n!) {
            return 0;
        }
    }
    foreach $n (@ACCEPT) {      # For each node to be scanned...
        return 1 if $_[0] =~ m!^http://$n!;     # Parse if it matches
    }
    return 0;                   # Don't parse - not to be scanned
}

##############################################################################
sub PING {                      # verify existence, but don't parse
                                # $_[0] is the absolute URL
    # It's OK to follow if it's an http, ftp, or gopher link.
    $_[0] =~ m!^(http|ftp|gopher):!;
}

##############################################################################
# This package overrides the start method of the HTML::Parser class and adds
# a get_links method. It is enclosed in a block to make it a temporary local
# package.
{
    package ParseLink;
    @ISA = qw(HTML::Parser);

    # Check the incoming tag. Mark "a" tags as hrefs, "img" tags as srcs.
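    # ($attr arrives as a reference to a hash of the tag's attributes, so for
    #  <a href="foo.html"> the code below does $this->{links}{"foo.html"}++.)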
    sub start {                         # called by parse
        my $this = shift;
        my ($tag, $attr) = @_;
        if ($tag eq "a") {
            $this->{links}{$attr->{href}}++;
        }
        elsif ($tag eq "img") {
            $this->{links}{$attr->{src}}++;
        }
    }

    # Pull all of the found links out of this page and return as an array.
    sub get_links {
        my $this = shift;
        sort keys %{$this->{links}};
    }
}                                       # end of ParseLink

##############################################################################
$ua = new LWP::UserAgent;               # Create a spider object
$ua->agent("hverify/1.0");              # Give it a name
$ua->env_proxy;                         # Get proxy info (if any) from
                                        # environment variables

# Determine the robot exclusion rules for the nodes in @ACCEPT.
my $robotsrules = new WWW::RobotRules 'MOMspider/1.0';
                                        # Acting like MOMspider, so follow
                                        # rules for it
my $place;
foreach (@ACCEPT) {                     # For each node we'll access ...
    $place = "http://$_/robots.txt";    # exclusion file URL
    my $robots_txt = get $place;        # Get the rules
    $robotsrules->parse($place, $robots_txt);   # and parse them
}

$| = 1;                                 # Don't buffer output

# Run this loop until nothing is left to check. We queue new URLs at the
# end, so we get a breadth-first check.
$urlcount = 0;

URLCHECK:
while ($thisurl = shift @CHECK) {       # Get another one to check
    ($thisurl, $refurl) = split(/$sep/, $thisurl);  # Split off referring URL
    $thisurl =~ s/%7e/~/ig;             # ugh :-) put tildes back in again

    # Decide if the URL should be skipped, either because the rules
    # disallow it or we have already visited it.
    next URLCHECK if $did{$thisurl}++;  # skip this if we did it
    unless ($robotsrules->allowed($thisurl)) {
        print "[Skipping $thisurl (blocked by server rules)]\n";
        $did{$thisurl}++;               # Pretend we did this one anyway
        $nowait = 1;                    # No wait for skipped links
        $urlcount++;
        next URLCHECK;                  # And go to the next one
    }

    # We add the sleep here to keep the load down on the web servers.
    sleep $opt_z unless $nowait;        # be polite and don't hog the server
    $nowait = 0;                        # Assume we'll need to wait

    # Check the URL. If it's valid, fetch it. Do nothing further if
    # what it points to isn't a page.
    if (PARSE $thisurl) {               # If it's a valid URL...
        print "fetching $thisurl on page $refurl\n";    # Say we are doing it
        $urlcount++;
        $request  = new HTTP::Request('GET', $thisurl); # Set up a GET request
        $response = $ua->request($request);             # do it
        unless ($response->is_success) {                 # if fetch fails ...
            print "Cannot fetch $thisurl on page $refurl (status ",
                  $response->code, " ", $response->message, ")\n";  # say why
            next URLCHECK;                               # and try the next one
        }
        next URLCHECK unless $response->content_type =~ /text\/html/i;
                                        # Stop following at non-page items

        # Here we find the base URL of the page, and parse out any other
        # links to pages or images.
        $base = $response->base;        # extract base URL
        my $p = ParseLink->new;         # Prepare to parse it
        $p->parse($response->content);  # parse the page
        $p->parse(undef);               # Tack on EOF signal (Parser req.)

        # Add all of the newly-found URLs to the list to be checked.
        for $link ($p->get_links) {     # for each link on the page ...
            $abs = url($link, $base)->abs;      # convert to an absolute URL
            debug("... $link => $abs\n");       # echo it
            push(@CHECK, "$abs$sep$thisurl");   # stick it on the list to check
        }
        next URLCHECK;                  # and go round again
    }

    # This is a leaf URL. Try just to fetch it to verify that it's not broken.
    unless ($opt_v) {                   # -v turns this off
        if (PING $thisurl) {            # If we can reach this server ...
            print "verifying $thisurl\n";
            $urlcount++;
            for $method (qw(HEAD GET)) {        # Try HEAD first, then GET
                $request  = new HTTP::Request($method, $thisurl);   # a request object
                $response = $ua->request($request);      # fetch!
                next URLCHECK if $response->is_success;  # We could do it, go on
            }
            print "Cannot fetch $thisurl on page $refurl (status ",
                  $response->code, " ", $response->message, ")\n";
            next URLCHECK;              # Couldn't get to it
        }
    }
    else {                              # -v set: just note offsite links
        print "[skipping $thisurl]\n";
        $urlcount++;
        $nowait = 1;                    # No load for skipped links
        next URLCHECK;                  # Skip it
    }

    # The URL has a scheme we don't follow (e.g. mailto:). Note it and go on.
    print "[skipping $thisurl]\n";
    $nowait = 1;                        # No load for skipped links
}

print "[Total URLs checked: $urlcount]\n";

##############################################################################
sub debug {
    print @_ if $opt_d;
}
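Incidentally, if your copy of libwww-perl is new enough to include the
HTML::LinkExtor module, the ParseLink package above could probably be replaced
with something along these lines. This is only an untested sketch, and it
borrows the $response, $base, @CHECK, $sep, and $thisurl variables from the
main loop:

use HTML::LinkExtor;                    # link extractor bundled with newer LWP

my %links;                              # href/src values found on this page
my $extor = HTML::LinkExtor->new(
    sub {                               # called once per link-bearing tag
        my ($tag, %attr) = @_;
        $links{$attr{href}}++ if $tag eq "a"   && defined $attr{href};
        $links{$attr{src}}++  if $tag eq "img" && defined $attr{src};
    });
$extor->parse($response->content);      # parse the page
$extor->eof;                            # signal end of document

for $link (sort keys %links) {          # for each link on the page ...
    $abs = url($link, $base)->abs;      # same URI::URL conversion as above
    push(@CHECK, "$abs$sep$thisurl");   # queue it with its referring URL
}

HTML::LinkExtor already knows which attributes carry URLs, so dropping the
$tag test should also pick up links that ParseLink ignores (FRAME SRC, BODY
BACKGROUND, and so on).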
print "verifying $thisurl\n"; $urlcount++; for $method (qw(HEAD GET)) { # Try both HEAD and GET methods $request = new HTTP::Request($method,$thisurl);# a request object $response = $ua->request($request); # fetch! next URLCHECK if $response->is_success; # We could do it, go on } print "Cannot fetch $thisurl on page $refurl (status ", $response->code, " ", $response->message,")\n"; next URLCHECK; # Couldn't get to it } } else { print "[skipping $thisurl]\n"; $urlcount++; $nowait = 1; # No load for skipped links next URLCHECK; # Skip it } # We've already seen this one. Note that and go on. print "[skipping $thisurl]\n"; # Already-seen URLs $nowait = 1; # No load for skipped links } print "[Total URLs checked: $urlcount]\n"; ########################################################################### ##### sub debug { print @_ if $opt_d; } This code knows nothing about imagemaps, CGI, Javascript, or Java. Other than that, you should be able to do a decent job with it. I have been running this code on Unix, but it doesn't have any fatal errors that I can see would keep it from running under MacPerl. --- Joe M.