use strict;
sub extract
{
my $str = shift;
##
## Define the possible charcter sets...
##
my $regular_euc = q/
(?:\xa1[\xa1-\xff]) |
(?:\xfe[\x00-\xfe]) |
(?:[\xa2-\xfd][\x00-\xff])
/;
my $hankaku_kana = q/(?:\x8e[\xa1-\xdf])/;
my $ascii = q/(?:[\x20-\x7e])/;
##
## Confused? So am I!
##
## Basically, this is what it says:
##
## regular euc ( 2 bytes ) =>
## \xa1 can be followed by range \xa1 to \xff OR
## \xfe can be followed by range \x00 to \xfe OR
## range \xa2 to \xfd can be followed by range \x00 to \xff
##
## user defined ( 3 bytes ) =>
## \x8e can be followed by sequence that follows the
## "regular euc" rule. -- this has been ommited. For
## my purposes this will never be used.
##
## hankaku kana ( 2 bytes ) =>
## \x8e can be followed by range \xa1 to \xdf.
## (Notice that since the 2 bytes fall in the range of
## "user defined" encoding, we match this AFTER "user defined
+".
## So hankaku kana is matched only when the "user defined"
## case fails)
##
## ascii ( 1 byte ) =>
## range \x20 to \x7e. This only includes "printable"
## ASCII
##
$str =~ m<
(
$regular_euc |
$hankaku_kana |
$ascii
)
>gxo
}
sub to_regexp
{
my @tokens = @_;
my $regexp;
foreach my $token ( @tokens ) {
if( length( $token ) == 2 ) {
$regexp .= sprintf(
'(?:\x%s)', unpack( "H*", substr( $token, 0, 1 ) )
);
$regexp .= sprintf(
'(?:\x%s)', unpack( "H*", substr( $token, 1, 1 ) )
);
} else {
$regexp .= $token;
}
}
$regexp;
}
my $string = "put some japanese ( euc ) string in here -- pm doesn't a
+ccept my input, unfortunately"
my $pattern = "place here a pattern -- yeah, if you're malicious
+enough this will break";
my @tokens = extract( $pattern );
my $byte_pattern = to_regexp( @tokens );
$string =~ s/$byte_pattern/some_new_pattern/g;
print $byte_pattern, "\n";
print $string. "\n";
As I said, this is hack. I'm well aware of that. But it serves my purpose |