On Sun, Jun 13, 1999 at 06:10:27AM -0400, Bill Jones wrote: > Well, if you sit next to him, ask him why I > am getting the URL three times out of this > mess: > > > $_ = 'http://web.fccj.org/~wcjones/index.html'; > s{http:(.*?)} {<A HREF="$&">$_</A>}gs; $& is still naughty, no? Wouldn't: s|(http://(.*))|<A HREF="$1">$2</A>|gs; have been better? Or, assuming you're trying to find absolute URIs in arbitrary text, here's a "more complete" command line utility I use from time to time. Its a mess, but the basic idea is there. It still trips on things like "File::Copy". #!/usr/bin/perl # Scan a doc for URLs, see if they exist. use strict; use LWP::Simple; use URI::URL; use Getopt::Long; use vars qw($Verbose); GetOptions("verbose" => \$Verbose, "v" => \$Verbose ); # From RFC 2396 # MOVE THIS SOMEWHERE BETTER! use constant ALPHA_SET => 'A-Za-z'; use constant DIGIT_SET => '0-9'; use constant ALPHA_NUM_SET => ALPHA_SET . DIGIT_SET; use constant SCHEME_SET => ALPHA_NUM_SET . '\+\-\.'; use constant RESERVED_SET => '\;\/\?\:\@\&\=\+\$\,'; # We want to escape &, ; and =, so remove them from the reserved set use constant RESERVED_SET_CHEAT => '\/\?\:\@\+\$\,'; use constant ESCAPED_SET => '\%' . DIGIT_SET . 'A-Fa-f'; use constant MARK_SET => '\-\_\.\!\~\*\'\(\)'; use constant UNRESERVED_SET => ALPHA_NUM_SET . MARK_SET; use constant URIC_NO_SLASH_SET => UNRESERVED_SET . ESCAPED_SET . '\;\?\:\@\&\=\+\$\,'; use constant URIC_SET => RESERVED_SET_CHEAT . UNRESERVED_SET . ESCAPED_SET; foreach my $file (@ARGV) { local *FILE; open(FILE, $file) || die "Can't open $file because $!"; my $line; while(defined($line = <FILE>)) { my($alphaSet) = ALPHA_SET; my($schemeSet) = SCHEME_SET; my($uircSet) = URIC_SET; # Don't assume http, don't accept relative URIs. URI::URL::strict(1); # Look through each line of the array looking for possible URLs. my $possibleURI; my $uri; # Yes, evil. Basically, look for something vaguely resembling a URL, # then hand it off to URI::URL for examination. If it passes, wrap it # in a <A> tag and replace it in the string. # Oh, BTW URI::URL croaks on failure, so we need the eval block. local $SIG{__DIE__} = 'DEFAULT'; $line =~ s/([$alphaSet][$schemeSet]*:[$uircSet]+)/ eval { $uri = URI::URL->new($1); }; if($@ || !defined $uri) { $1 } else { print STDERR ("Trying $file-$.: $uri\n") if $Verbose; my $enohead = 0; unless( head($uri) ){ #heuristic, trim crap $enohead = 1; if($uri =~ s|[\)\,\.]+$||) { print STDERR "Attempting heuristic... " if $Verbose; $enohead = 0 if head($uri); if($Verbose) { $enohead ? print STDERR "failed\n" : print STDERR "succes!\n"; } } } print("$file-$.: $uri\n") if $enohead; } /egx; } } -- Michael G Schwern schwern@pobox.com http://www.pobox.com/~schwern /(?:(?:(1)[.-]?)?\(?(\d{3})\)?[.-]?)?(\d{3})[.-]?(\d{4})(x\d+)?/i ==== Want to unsubscribe from this list? (Don't you love us anymore?) ==== Well, if you insist... Send mail with body "unsubscribe" to ==== fwp-request@technofile.org