
[MacPerl] Re: (blowjob.pl) Memory Problem was: HTML link checker?



Sorry, Darryl and all others who tried my script; it was just the wrong
version, with a bug.

Here's my latest beta version of 'blowjob.pl' (sorry for the name as well
:) which should work now. It reads all URLs listed in a text file to the
local hard disk and now creates all necessary folders, so that the local
documents and images should have the right paths.
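
For reference, a job file is just plain text with one URL per line; lines
starting with '#' are skipped as comments. A minimal example (the hostnames
are made up):

        # blowjob.job -- pages to fetch
        http://www.foo.org/index.html
        http://www.bar.ch/docs/manual.html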

But there's a problem: it produces memory leaks (system heap) and I don't
know why. Sometimes it even crashes! Help is appreciated. (Does it
have something to do with parameter passing to subroutines and the 'my'
declarations?)
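
Just to illustrate what I mean by parameter passing (only a sketch, not a
diagnosis of the leak): assigning @_ to a 'my' list copies the arguments,
while passing a reference only copies the reference. The sub names below
are made up:

        $html = "x" x 100_000;
        sub by_value { my ($text)  = @_; length $text   }  # copies the whole string
        sub by_ref   { my ($rtext) = @_; length $$rtext }  # copies only the reference
        by_value( $html );
        by_ref( \$html );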

'Blowjob' is free, modify it as you like and forget about the copyright
notice... ;-) The next version will support various command line options,
but that will probably only make sense if somebody is able to fix the
memory problem...

Greetings,
           Erich.

-------------------------------------------------------------<snip
#! /usr/bin/perl

use LWP::UserAgent;
use File::Basename;
use URI::URL;
use HTML::LinkExtor;

=head1 NAME

B<Blowjob> - a simple http sucker

=head1 SYNOPSIS

I<Usage:> blowjob [<filename>]

        <filename> is a file that contains a list of URLs separated by
        newline. Default <filename> is 'blowjob.job'.
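
I<Example:> blowjob mylist.job

        reads all URLs listed in 'mylist.job' into folders below the
        current one ('mylist.job' is just an example name).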

=head1 DESCRIPTION

Reads a number of HTTP documents from web servers into local folders
below the current one (one folder per host). The URLs to read are
listed in the input file given on the command line.

=cut

### startup

$version = "0.37 beta";
&say( "Blowjob/$version (c) 1996 by Erich Rast\n\n" );

### filename and path & misc

$file = shift;
$file = 'blowjob.job' unless $file;     # default input file
die "Too many arguments.\n" if ( @ARGV ); # more than 1 argument ?
$home = `pwd`;
chop($home);
my $priority = 0777; # read/write/execute for everybody (octal mode bits)

use Config;
if ( $Config{'osname'} =~ /^macos/i ) {
        $dir_sep = ':';  $dir_up = '::';        # '::' is the parent folder on MacOS
} elsif ( $Config{'osname'} =~ /^msdos/i ) {
        $dir_sep = '\\'; $dir_up = '..';
} else {
        $dir_sep = '/';  $dir_up = '..';
}


### create user agent

$ua = new LWP::UserAgent;
$ua->agent("Blowjob/$version");

### set up parser callback routine

sub callback {
   my ($tag, %attr) = @_;
   return if ( $tag ne 'img' and $tag ne 'a' );  # forget all but links and images
   # %attr holds only the link attributes, so its values are the URLs
   if ( $tag eq 'img' ) { push(@imgs, values %attr) }
   else                 { push(@links, values %attr) }
}

### make parser

$p = HTML::LinkExtor->new(\&callback);

#################
### main loop ###
#################

# globals: $host, $job, $url, $res, $wd_count, $f_count, $d_count;
$wd_count = $f_count = $d_count = 1;
local $workdir;

open( IN, $file ) || die "Cannot open jobfile '$file': $!\n";
binmode IN;

JOB:
        while ( $job = <IN> ) {
                next JOB if ( $job =~ /^#/o );
                chop ( $job );
                $url = new URI::URL $job;
                if ( $workdir = $url->host ) {
                        $workdir .= ('#' . ++$wd_count) if (-e $workdir);
                        mkdir( $workdir, $priority )
                                || &say("# Couldn't create '$workdir' with permission $priority: $!\n");
                        chdir( $workdir )
                                || &say("# Couldn't chdir to '$workdir': $!\n");
                        &get_url( $url );
                        chdir( $home )
                                || &say("# Couldn't chdir to '$home': $!\n");
                } else {
                        &say( "# Not a valid URL: '$job'\n" );
                }
        }

close IN;
&say( "Done.\n" );

### get_url( $url )
### get a url and all images and local documents

sub get_url {
        my( $url ) = @_;
        my $res = &http_get( $url );
        return unless $res;
        my $base = $res->base;
        @imgs = (); @links = ();
        $p->parse( $res->content );     # parse for img and link tags
        $p->eof;                        # flush the parser before the next document
        @imgs = map { $_ = url($_, $base)->abs; } @imgs;
        @locals = map { $_ = url($_, $base)->abs; } &get_locals( @links );
        foreach $local (@locals) {
                &http_get( $local, $base );
        }
        foreach $img (@imgs) {
                &http_get( $img, $base );
        }
}

### get_locals( @any_links ) ==> @local_abs_links
### return all local @links

sub get_locals {
        my @locals;
        foreach $link (@_) {
                unless ( $link =~ m|//|so ) { push @locals, $link }
        } # maybe map would be more elegant here
        return @locals;
}
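
# note: a grep (rather than the map mentioned above) would do the same
# filtering in one line, e.g. (just a sketch):
#
#       sub get_locals { return grep { $_ !~ m|//|s } @_ }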

### http_get( $url, $base ) ==> $response_object
### send http GET request and return object received
### side effect: save received object to $url path without $base

sub http_get {
        my ($url, $base ) = @_;
        my $req = new HTTP::Request GET => $url;
        my $res = $ua->request( $req );
        if ( $res->is_error ) {
                &say("# Couldn't get <$url>: ",
                 $res->code,
                 " - ", $res->message, "\n" );
                return 0;
        }
        $req = $res->request;
        $url = $req->url; # server may have returned a changed url (e.g. added '/')
        $base = $res->base unless $base;
        &write_file( $res, $url, $base );
        return $res;
}

### &write_file( $response_object, $url, $base )
### write file to local disk from an absolute url and the base
### side effect: create the OS specific path if it does not yet exist
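### example (made-up names): with $base 'http://www.foo.org/' and $url
### 'http://www.foo.org/docs/pics/logo.gif' the folders 'docs' and 'pics'
### are created below the current one and the data is saved as 'logo.gif'
### in there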

sub write_file {
        my ($res, $url, $base) = @_;
        my $old_url = $url;
        $url =~ s/^\Q$base\E//;         # strip the $base part, keep the path below it
        $url =~ s|/|$dir_sep|g;         # change separator from unix to our OS
        my ($file, $path) = fileparse( $url );  # split path from filename
        my @components = split( /\Q$dir_sep\E/, $path ); # split path into single dirs
        my ($count, $i) = (0, 0);       # just to be sure
        foreach $dir (@components) { # create each dir and make it current
                $dir .= ('#' . ++$d_count) if (-e $dir);
                mkdir( $dir, $priority )
                        || &say( "# Couldn't create '$dir' with permission $priority: $!\n" );
                if ( chdir $dir ) { ++$count }
                        else { &say("# Couldn't chdir to '$dir': $!\n") }
        }

        $file = 'index.html' unless $file; # default
        $file .= ('#' . ++$f_count) if (-e $file );

        open( OUT, ">" . $file ) || &say( "# Couldn't write to '$file': $!\n" );
                binmode OUT;            # images are binary data
                print OUT $res->content;
        close OUT;
        &say( "<$old_url> read as '$file'\n" );

        for ($i = 0; $i < $count; $i++) { # go back up to the dir we came from
                chdir( $dir_up ) || &say("# Couldn't chdir to '$dir_up': $!\n");
        } # end of strange directory hacking
}


### say( @comments )
### display comments if necessary

sub say {
        foreach $comment (@_) { print $comment }
}

__END__