[Date Prev][Date Next][Thread Prev][Thread Next] [Search] [Date Index] [Thread Index]

Re: [MacPerl] 3 Questions and a Possible



Jamie McCarthy <jamie@voyager.net> writes

>Joseph, can't help you on your first and last question, but...
>
>>2. I'm trying to take a string of characters, look for "special"
>>characters and transliterate those special characters to &#XXX;
>>strings for html.
>
>My solution:
>
>$string =~ s/&/&amp;/g;
>$string =~ s/"/&quot;/g;
>$string =~ s/</&lt;/g;
>$string =~ s/>/&gt;/g;
>$string =~ s/([\200-\377])/"&#".ord($1).";"/ge;
>
>I suppose it could all be done in the last line, by using
>([\042\046\074\076\200-\377]) instead of ([\200-\377]), but I prefer to
>see those four tags spelled out.
>
>Hm, are 91-96 and 123-127 considered special as well?  I guess I should
>reread the HTML spec...  :-)


Assuming you're working with Mac files, here is some code I wrote a long
time ago and don't remember much about. I was running it on a UNIX box. It
remaps Mac characters such as curly quotes to their ISO equivalents.
It's line-oriented, mainly for logging purposes.

    # Copy IN to OUT one line at a time. Lines containing a character
    # outside $BODYTEXT_CHARSET:: are first passed through
    # handle_non_printables; every line then goes through
    # process_HTML_tags before being written out with a newline.
    while (<IN>)
    {
        # chomp strips only a trailing newline; chop would also eat the
        # last real character of a final line that has no newline.
        chomp;
        $ss = $_;
        $ss = &handle_non_printables($ss)
            if ($ss =~ /[^$BODYTEXT_CHARSET::]/);
        $ss = &process_HTML_tags($ss, $prelude_type);
        print OUT "$ss\n";
    }


# handle_non_printables(LINE)
#
# Returns a copy of LINE in which every character outside
# $BODYTEXT_CHARSET:: has been replaced:
#   - by the numeric entity "&#NNN;" when @Mac_to_ISO_8859:: holds a
#     non-zero code NNN for the character and $CONVERT_ISO8859_CHARS::
#     is true;
#   - by "?" otherwise.
#
# Logging (through the user-defined &log, which takes a printf-style
# format followed by its arguments):
#   - one message per call when $LOG_NON_PRINTABLE_LINES:: is true;
#   - one message per offending character when the character has no
#     table entry or when $LOG_EACH_NON_PRINTABLE:: is true.
#
# NOTE: calls must keep the leading "&" on log, because a bare log(...)
# would invoke Perl's builtin logarithm function instead.
sub  handle_non_printables ($)
{
    my ($line) = @_;
    my ($result, $code, $col, $iso);

    &log("non-printable character on line $. of '$PAGE_URL::'")
        if ($LOG_NON_PRINTABLE_LINES::);
    # we go through the string character-by-character mainly to
    # catch any characters outside BODYTEXT_CHARSET+ISO 8859
    # in the unusual case neither of the log warning flags enables.
    # Efficiency isn't considered terribly important because we don't
    # expect many instances of non-printables. Had speed been important
    # we could probably have used a tr operation
    $col = 0;
    $result = "";
    foreach my $c (split(//, $line)) {
        $col++;
        if ($c =~ /[^$BODYTEXT_CHARSET::]/) {
            $code = ord($c);
            # undef for codes outside the table, 0 for "no equivalent"
            $iso = $Mac_to_ISO_8859::[$code];
            # The format string is kept on a single line so that the
            # logged message does not contain an embedded newline.
            &log("non-printable character 0x%02X at col %d of line $. of '$PAGE_URL::'",
                 $code, $col)
                unless ($iso && !$LOG_EACH_NON_PRINTABLE::);
            if ($iso && $CONVERT_ISO8859_CHARS::) {
                $c = "&#$iso;";
            } else {
                $c = "?";  # for easy eye-balling
            }
            # $c is what we pass on to output...
        }
        # split never yields undef, so $c is always defined here.
        $result .= $c;
    }
    return $result;
}


# Here are conversion tables between the standard Roman encoding used
# on the Macintosh and ISO 8859-1.
# Yacute exists only in the Icelandic/Faroese variation of the standard
# Mac Roman charset. I don't know the best solution, but I'm mapping it
# to Mac char code 160, which is the dagger in standard Mac Roman.

# ISO 8859-1 => Mac Roman. Indexed by ISO code; value is the Mac code.
# NB all ISO codes from 160 are used
# The list holds 96 values, one for each ISO code 160..255, so the slice
# must cover that whole range. A shorter slice silently discards the
# surplus values and leaves most of the table undefined.
@ISO_8859_to_Mac::[160..255] =
(
    202, 193, 162, 163, 219, 180, 207, 164, 172, 169,  # 160
    187, 199, 194, 208, 168, 248, 161, 177, 211, 210,
    213, 181, 166, 225, 252, 245, 188, 200, 185, 184,  # 180
    178, 192, 203, 231, 229, 204, 128, 129, 174, 130,
    233, 131, 230, 232, 237, 234, 235, 236, 220, 132,  # 200
    241, 238, 239, 205, 133, 215, 175, 244, 242, 243,
    134, 160, 222, 167, 136, 135, 137, 139, 138, 140,  # 220
    190, 141, 143, 142, 144, 145, 147, 146, 148, 149,
    221, 150, 152, 151, 153, 155, 154, 214, 191, 157,  # 240
    156, 158, 159, 224, 223, 216
);

# Mac Roman => ISO 8859-1. Indexed by Mac code; value is the ISO code,
# or 0 where the Mac character has no ISO equivalent.
# NB the mapping is sparse in this direction 128..252 => 160..255
# The reverse mapping could easily be computed instead of being
# hand-coded. That would be a real help if more than one input character
# set were envisaged, and it would remove a source of errors too.
# Every non-zero entry here is the exact inverse of the hand-coded
# ISO-to-Mac list: Mac 154 (o-diaeresis) => 246, Mac 155 (o-tilde) => 245
# and Mac 168 (registered sign) => 174.
@Mac_to_ISO_8859::[128..255] =
(
                                            196, 197,
    199, 201, 209, 214, 220, 225, 224, 226, 228, 227,  # 130
    229, 231, 233, 232, 234, 235, 237, 236, 238, 239,
    241, 243, 242, 244, 246, 245, 250, 249, 251, 252,  # 150
    221, 176, 162, 163, 167,   0, 182, 223, 174, 169,
      0,   0, 168,   0, 198, 216,   0, 177, 190,   0,  # 170
    165, 181,   0,   0, 189, 188,   0, 170, 186,   0,
    230, 248, 191, 161, 172,   0,   0,   0,   0, 171,  # 190
    187,   0, 160, 192, 195, 213,   0, 166, 173,   0,
    179, 178,   0, 180, 247, 215, 255,   0,   0, 164,  # 210
    208, 240, 222, 254, 253, 183,   0,   0,   0, 194,
    202, 193, 203, 200, 205, 206, 207, 204, 211, 212,  # 230
      0, 210, 218, 219, 217, 185,   0,   0, 175,   0,
      0,   0, 184,   0,   0,   0
);