[Date Prev][Date Next][Thread Prev][Thread Next] [Search] [Date Index] [Thread Index]

Re: [MacPerl] Introduction - use of modules: "ParseHTML.pm"




I wrote before, re/ HTML::Parser:
 >I've also written a cover-module that makes the
 >functionality available in a non-OO way.  You just call it twice, once with
 >a list of call-back routines, and again to start the parsing:


7/10/99, Bart Lateur wrote to me:
 >I'm interested. I'm sure a lot of people are. If it's big, put it on
 >some web space. But if it's only two pages, I don't really see any harm
 >in posting it to the mailing list.  :-) BTW plain text, if you can.


Dear Bert,

It's very simple code, but you can use it without reading anything about OO.

The module is about one page, below, and after that is a simple example
client script.

-rkm

btw> HTML::Entities is useful if you want "&lt;" to be translated to "<"
(etc) - but that's not OO.


============================================================================
#!/usr/bin/perl -w
use diagnostics ;
use strict ;
my $Script = 'ParseHTML.pm' ;
my $Version = '1.0' ;

package ParseHTML ;         # Is subclass of HTML::Parser.
    require HTML::Parser ;  # No internal state retained across method calls,
    use vars qw(@ISA) ;     # outside of the $self hash.
    @ISA = qw(HTML::Parser) ;

##      Copyright (c) 1999 Richard K. Moore.  All Rights Reserved.
##      Shareable under standard Perl terms.
##                  richard@cyberjournal.org

#------------------------------------------------------------------
#------------------------------------------------------------------
### Methods, overriding HTML::Parser.
###

sub new ;           # Instantiate a new document object.
sub start ;         # Process a start tag, possibly with attributes.
sub end ;           # Process an end tag.
sub text ;          # Process a chunk of text between tags.

#------------------------------------------------------------------
sub new             # Instantiate a new document object.
{
    my ( $class, $startProcRef, $endProcRef, $textProcRef ) = @_ ;

    my $self = HTML::Parser->new ;      # Create a Parser object.
    bless $self, $class ;               # Re-bless it as our own class,
                                        # or a subclass of that.

    @$self{ qw
        (  startProcRef   endProcRef   textProcRef ) } =
        ( $startProcRef, $endProcRef, $textProcRef ) ;

    $self ;                             # Return object ref.
}
#------------------------------------------------------------------
sub start           # Process a start tag, possibly with attributes.
{
    my ( $self, $Tag, $Attr, $AttrSeq, $TagText ) = @_ ;
    my $startProcRef = $self->{startProcRef} ;
    &$startProcRef( $Tag, $Attr, $AttrSeq, $TagText ) ;

}
#------------------------------------------------------------------
sub end             # Process an end tag.
{
    my($self, $Tag) = @_ ;
    my $endProcRef = $self->{endProcRef} ;
    &$endProcRef( $Tag ) ;
}
#------------------------------------------------------------------
sub text            # Process a chunk of text between tags.
{
    my ($self, $TextSnip) = @_ ;
    my $textProcRef = $self->{textProcRef} ;
    &$textProcRef( $TextSnip ) ;
}
#------------------------------------------------------------------
#------------------------------------------------------------------

1 ;             # Packages are supposed to return true when executed.

#------------------------------------------------------------------
### end script.
#------------------------------------------------------------------

============================================================================
#!/usr/bin/perl -w
use diagnostics;
use strict;
my $Script = "DumpHeadings";
my $Version = "1.0";

use ParseHTML ;
use File::Find ;        # (&find, $name) Cookbook 322.
use File::Basename ;    # &fileparse Cookbook 328

##      DumpHeadings.pl - Dump out text between <h> </h> tags.
##      Copyright (c) 1999 Richard K. Moore.  All Rights Reserved.
##      Shareable under standard Perl terms.
##                  richard@cyberjournal.org

#------------------------------------------------------------------

### Accept a list of files and directories.
### Find all the HTML files in this collection, and for each such
### $SourceFile.html, call ParseHTML to parse it, and print out the
### 'headers' to "$SourceFile.heads" in a new directory
### "$SourceDirectory.heads".

my $InHeader = 0 ;

File::Find::find (\&DumpFile, @ARGV);   ## Calls back for each root directory,
                                        ## subdirectory, and file.

sub DumpFile
{
    my $SourceFile = $File::Find::name;
    my ( $Name, $Dir, $Ext ) = fileparse( $SourceFile, '\.[^.]*' ) ;

    my $DirPlain = substr $Dir, 0, (-1 + length $Dir ) ;
    my $DirSep = substr $Dir, -1 ;
    my $OutDir = "$DirPlain.heads" ;

    if ( -d $SourceFile )
    {
        mkdir "$SourceFile.heads", 2   # Prog.Perl 187, 229.
            or die "$Script can't create $OutDir- $!\n" ;
    }
    elsif ( ( -f $SourceFile ) &&           # A file?
            ( $Ext =~ /s?htm?l?$/ ) &&      # With an HTML extension?
            ( open IN, $SourceFile ) )      # And openable?
    {
        ## We have an HTML file and it's openable: parse it.

        close IN ;

        my $DestFile = "$OutDir$DirSep$Name.heads" ;
        open OUT, ">$DestFile"
            or die  "$Script can't open $DestFile: $!\n" ;
        my $Doc = ParseHTML->new( \&StartTag, \&EndTag, \&TextSnip ) ;
            # Instantiate doc obj.

        $Doc->parse_file( $SourceFile ) ;               # Parse the file.
        close OUT
    }
}

sub StartTag
{
    my ( $Tag, $AttrHash, $AttrSeq, $TagText ) = @_ ;
    if ( $Tag =~ /^h[1-6]$/ )
    {
        $InHeader++ ;
    } ;

}

sub EndTag
{
    my ( $Tag ) = @_ ;
    if ( $Tag =~ /^h[1-6]$/ )
    {
        $InHeader = 0 ;
    } ;

}

sub TextSnip
{
    my ( $Text ) = @_ ;
    if ( $InHeader )
    {
        ## Prune extra whitespace.
        $Text =~ s/ +/ /g ;
        $Text =~ s/\n+/\n/g ;
        $Text =~ s/\t//g ;

        print OUT "$Text" ;

    }

}

============================================================================


===== Want to unsubscribe from this list?
===== Send mail with body "unsubscribe" to macperl-request@macperl.org