
Re: [MacPerl] Clarification of detecting "broken links"/invalid



Here is a program I use on our local site to verify links. It is a modified
version of Merlyn's code from his "Programming Perl" column in Web Techniques
magazine. It uses the LWP package to build a sort of mini web spider that
scans a site. Further comments are at the bottom. (This is a private utility,
so it makes a fair number of shortcuts and assumptions.)
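
For reference, the option syntax is documented in the comments near the top of
the script; a typical invocation (the hostname here is made up, purely to
illustrate the flags) might look like:

   perl hverify -z 10 -a 'www.example.com' http://www.example.com/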


#!/usr/bin/perl
use LWP::UserAgent;                     # Robot mechanisms
use HTML::Parser;                       # HTML parsing
use URI::URL;                           # URL parsing
use LWP::Simple qw(get);                # Basic file get
require WWW::RobotRules;                # Exclusion stuff
$opt_d = undef;                         # No debug prints
$sep = "\001";                          # separator char

$opt_z = 5;                             # Default delay
$opt_v = undef;                         # -v not set: do verify offsite links

# Modify @ACCEPT to contain the nodes you want to spider around on
@ACCEPT = qw(farside.gsfc.nasa.gov
            webserv.gsfc.nasa.gov
            sdcd.gsfc.nasa.gov
            esdcd.gsfc.nasa.gov
            nccsinfo.gsfc.nasa.gov
            outside.gsfc.nasa.gov);     # Nodes we will look at

$nodes_cleared = 0;
#
# hverify {-v} {-z seconds} {-a 'prefix prefix prefix ...'} 
#         {-r 'prefix prefix prefix...'} URL {URL URL ...}
#
#   -v: don't verify offsite (leaf) links, just note them
#   -z: number of seconds to pause before proceeding
#   -a: accept these nodename prefixes (e.g., outside.gsfc, mysite[0-9], etc.)
#   -r: reject these nodename prefixes (ditto)
#   URL: scan these URLs

ARGSCAN:
while (@ARGV) {
   $next = shift @ARGV;
   if ($next eq "-v") {                 # verify ON
      $opt_v++;
   }
   elsif ($next eq "-z") {              # delay
      $_ = shift @ARGV;         # get the argument (only one)
      die "No interval for -z" if $_ eq undef;
      $opt_z = ($_+0 > 0 ? $_+0 : 5);           # save it
   }
   elsif ($next eq "-a") {              # accept these prefixes
      unless ($nodes_cleared) {
         @ACCEPT=();
         $nodes_cleared++;
      }
      until (($next = shift @ARGV) =~ /^-/) {
          last ARGSCAN unless $next;    # drop out if @ARGV empty
          push @ACCEPT,$next;
      }
      unshift @ARGV,$next;              # put next switch back
      next ARGSCAN;                     # and restart loop
   }
   elsif ($next eq "-r") {              # trim theses prefixes
      until (($next = shift @ARGV) =~ /^-/) {
          last ARGSCAN unless $next;    # drop out if @ARGV empty
          push @REJECT,$next;
      }
      unshift @ARGV,$next;              # put next switch back
      next ARGSCAN;                     # and restart loop
   }
   else {                               # bare - assume it's a URL
      push @CHECK,$next;
   }
}

foreach (@CHECK) {
   $_ .= "$sep$_";                      # no referring URL
}
###########################################################################
# Parse keeps the search confined to the local nodes by default.
# This needs to be refined if we want to allow multi-node verifications.

sub PARSE {                     # verify existence, parse for further URLs
                                # $_[0] is the absolute URL
   my($n);
   foreach $n (@REJECT)  {      # for each leaf trim-point:
      if ($_[0] =~ m!^http://$n!) {
         return 0;
      }
   }
   foreach $n (@ACCEPT) {       # For each node to be scanned...
      return 1 if $_[0] =~ m!^http://$n!;       # Parse if it matches
   }
   return 0;                    # Don't parse - not to be scanned
}

###########################################################################
sub PING {                      # verify existence, but don't parse
                                # $_[0] is the absolute URL
  # It's OK to follow if it's an http, ftp, or gopher link.
  $_[0] =~ m!^(http|ftp|gopher):!;
}

###########################################################################
# This package subclasses HTML::Parser, overriding its start method and
# adding a get_links method. It is enclosed in a block to make it a
# temporary local package.
{
  package ParseLink;
  @ISA = qw(HTML::Parser);

  # Check the incoming tag. Mark "a" tags as hrefs, "img" tags as srcs.
  sub start {                   # called by parse
    my $this = shift;
    my ($tag, $attr) = @_;
    if ($tag eq "a") {
      $this->{links}{$attr->{href}}++;
    } elsif ($tag eq "img") {
      $this->{links}{$attr->{src}}++;
    }
  }

  # Pull all of the found links out of this page and return as an array.
  sub get_links {
    my $this = shift;
    sort keys %{$this->{links}};
  }
}                               # end of ParseLink
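
# As a quick standalone sanity check of ParseLink (the HTML fragment below is
# made up for illustration, not part of the original script), something like
# this should print "a.html" and "b.gif":
#
#   my $test = ParseLink->new;
#   $test->parse('<a href="a.html">x</a><img src="b.gif">');
#   $test->parse(undef);                        # signal end of input
#   print join("\n", $test->get_links), "\n";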

###########################################################################

$ua = new LWP::UserAgent;               # Create a spider object
$ua->agent("hverify/1.0");              # Give it a name
$ua->env_proxy;                         # Get proxy info (if any) from
                                        # environment variables

# Determine the robot exclusion rules for the nodes in @ACCEPT.

my $robotsrules = new WWW::RobotRules 'MOMspider/1.0';
                                        # Acting like MOMspider, so follow
                                        # rules for it
my $place;
foreach (@ACCEPT) {                     # For each node we'll access ...
   $place = "http://$_/robots.txt";     # exclusion file URL
   my $robots_txt = get $place;         # Get the rules
   next unless defined $robots_txt;     # skip if no robots.txt was found
   $robotsrules->parse($place, $robots_txt);    # and parse them
}

$| = 1;                                 # Don't buffer

# Run this loop until nothing is left to check. We queue new URLs on at
# the end, so we get a breadth-first check.
$urlcount = 0;
URLCHECK:
  while ($thisurl = shift @CHECK) {     # Get another one to check
    ($thisurl,$refurl) = split(/$sep/,$thisurl); # Split off referring URL
    $thisurl =~ s/%7e/~/ig;             # ugh :-) put tildes back in again

    # Decide if the URL should be skipped, either because the rules
    # disallow it or we have already visited it.

    next URLCHECK if $did{$thisurl}++;          # skip this if we did it
    unless ($robotsrules->allowed($thisurl)) {
      print "[Skipping $thisurl (blocked by server rules)]";
      $did{$thisurl}++;                 # Pretend we did this one anyway
      $nowait = 1;                      # No wait for skipped links
      $urlcount++;
      next URLCHECK;                    # And go to the next one
    }

    # We add the sleep here to keep the load down on the web servers.

    sleep $opt_z unless $nowait;        # be polite and don't hog the server
    $nowait = 0;                        # Assume we'll need to wait

    # Check the URL. If it's valid, fetch it. Do nothing further if
    # what it points to isn't a page.

    if (PARSE $thisurl) {               # If it's a valid URL...
      print "fetching $thisurl on page $refurl\n";      # Say we are 
doing it
      $urlcount++;
      $request = new HTTP::Request('GET',$thisurl);     # Set up a GET request
      $response = $ua->request($request);               # do it

      unless ($response->is_success) {  # if fetch fails ...
        print
          "Cannot fetch $thisurl on page $refurl (status ",
          $response->code, " ", $response->message, ")\n";      # say why
        next URLCHECK;                  # and try the next one
      }
      next URLCHECK unless $response->content_type =~ /text\/html/i;
                                        # Stop following at non-page items

      # Here we find the base URL of the page, and parse out any other
      # links to pages or images.

      $base = $response->base;          # extract base URL
      my $p = ParseLink->new;           # Prepare to parse it
      $p->parse($response->content);    # parse the page
      $p->parse(undef);                 # Tack on EOF signal (Parser req.)

      # Add all of the newly-found URLs to the list to be checked.

      for $link ($p->get_links) {       # for each link on the page ...
        $abs = url($link, $base)->abs;  # convert to an absolute URL
        debug("... $link => $abs\n");   # echo it
        push(@CHECK, "$abs$sep$thisurl");# stick it on the list to check
      }
      next URLCHECK;                    # and go round again
    }

    # This is a leaf URL. Try just to fetch it to verify that it's not broken.
    unless ($opt_v) {                   # -v turns this off
       if (PING $thisurl) {             # If we can reach this server ...
         print "verifying $thisurl\n";
         $urlcount++;
         for $method (qw(HEAD GET)) {   # Try both HEAD and GET methods
           $request = new HTTP::Request($method,$thisurl);  # a request object
           $response = $ua->request($request);          # fetch!
           next URLCHECK if $response->is_success;      # We could do it, go on
         }
         print
            "Cannot fetch $thisurl on page $refurl (status ",
            $response->code, " ", $response->message, ")\n";
         next URLCHECK;                 # Couldn't get to it
       }
    }
    else {
      print "[skipping $thisurl]\n";
      $urlcount++;
      $nowait = 1;                      # No load for skipped links
      next URLCHECK;                    # Skip it
    }

    # This is a link type we can't check (mailto:, news:, etc.). Note it and go on.

    print "[skipping $thisurl]\n";      # unverifiable link types
    $nowait = 1;                        # No load for skipped links
  }

print "[Total URLs checked: $urlcount]\n";
###########################################################################
sub debug {
   print @_ if $opt_d;
}

This code knows nothing about imagemaps, CGI, JavaScript, or Java. Other than
that, you should be able to do a decent job with it. I have been running it
on Unix, but I don't see anything in it that would keep it from running under
MacPerl.

 --- Joe M.