elef has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl
use strict;
use warnings;

#########################################################
# Translate HTML 2.0 entities found in file.html into the
# characters they stand for and write the result to file.txt.
# Both files are handled as UTF-8.
#
# Recognised entities are the named ones listed in %entity
# plus the decimal numeric references &#0; through &#255;.
# Anything else that merely looks like an entity is copied
# through untouched.
#########################################################

# Build the lookup table exactly once, before the read loop.
# It must not be a "my" variable declared inside the loop body and filled
# from a BEGIN block: such a lexical is cleared every time the loop body's
# scope is left, so the table would only be populated while the first
# input line is being processed.
my %entity = (
    Aacute => chr 193,    # capital A, acute accent
);
for my $code ( 0 .. 255 ) {
    $entity{ '#' . $code } = chr $code;
}

open( my $in, "<:encoding(UTF-8)", "file.html" )
    or die "Can't open file: $!";
open( my $out, ">:encoding(UTF-8)", "file.txt" )
    or die "Can't open file: $!";

while ( my $line = <$in> ) {

    # The replacement is looked up with exists() rather than by truthiness:
    # an entity such as &#48; maps to the string "0", which is false in Perl
    # and would otherwise be left untranslated.
    $line =~ s{
        (
            &               # an entity starts with an ampersand
            (
                \x23\d+     # and is either a pound sign and digits
              |             # or else
                \w+         # word characters up to a semicolon
            )
            ;?              # the terminating semicolon is optional
        )
    }{
        exists $entity{$2} ? $entity{$2} : $1
    }gex;    # /e: the replacement is code, not a string

    print {$out} $line;
}

close $in;

# Buffered write errors only surface when the handle is closed.
close $out or die "Can't close file: $!";
|
|---|