use strict;

sub extract
{
    my $str = shift;

    ##
    ## Define the possible charcter sets...
    ##
    my $regular_euc  = q/
        (?:\xa1[\xa1-\xff]) |
        (?:\xfe[\x00-\xfe]) |
        (?:[\xa2-\xfd][\x00-\xff])
    /;
 
    my $hankaku_kana = q/(?:\x8e[\xa1-\xdf])/;
    my $ascii        = q/(?:[\x20-\x7e])/;

    ##
    ## Confused? So am I!
    ##
    ## Basically, this is what it says:
    ##
    ## regular euc ( 2 bytes ) =>
    ##      \xa1 can be followed by range \xa1 to \xff OR
    ##      \xfe can be followed by range \x00 to \xfe OR
    ##      range \xa2 to \xfd can be followed by range \x00 to \xff
    ##
    ## user defined ( 3 bytes ) =>
    ##      \x8e can be followed by sequence that follows the
    ##      "regular euc" rule. -- this has been ommited. For
    ##      my purposes this will never be used.
    ##
    ## hankaku kana ( 2 bytes ) =>
    ##      \x8e can be followed by range \xa1 to \xdf.
    ##      (Notice that since the 2 bytes fall in the range of
    ##      "user defined" encoding, we match this AFTER "user defined".
    ##      So hankaku kana is matched only when the "user defined"
    ##      case fails)
    ##
    ## ascii ( 1 byte ) =>
    ##      range \x20 to \x7e. This only includes "printable"
    ##      ASCII
    ##

    $str =~ m<
        (
             $regular_euc  |
             $hankaku_kana |
             $ascii        
        )
    >gxo
}

sub to_regexp
{
    my @tokens = @_;

    my $regexp;
    foreach my $token ( @tokens ) {
        if( length( $token ) == 2 ) {
            $regexp .= sprintf(
               '(?:\x%s)', unpack( "H*", substr( $token, 0, 1 ) )
            );
            $regexp .= sprintf(
               '(?:\x%s)', unpack( "H*", substr( $token, 1, 1 ) )
            );
        } else {
            $regexp .= $token;
        }
   }       
   $regexp;
}        

my $string = "put some japanese ( euc ) string in here -- pm doesn't accept my input, unfortunately"
my $pattern     =  "place here a pattern -- yeah, if you're malicious enough this will break";

my @tokens = extract( $pattern );
my $byte_pattern = to_regexp( @tokens );

$string =~ s/$byte_pattern/some_new_pattern/g;
print $byte_pattern, "\n";
print $string. "\n";