Beefy Boxes and Bandwidth Generously Provided by pair Networks
Do you know where your variables are?
 
PerlMonks  

Regexp::Approx - Use fuzzy regular expressions

by diotalevi (Canon)
on Nov 12, 2003 at 06:09 UTC ( [id://306474]=sourcecode: print w/replies, xml ) Need Help??
Category: Miscellaneous
Author/Contact Info
Description:

This module allows you to use any of String::Approx's functions from within a regular expression. It also provides a convenience function `fuzzy_qr` which provides a regex fragment that works like an inline `scalar amatch( $pattern, $test )` expression.

use Regexp::Approx 'fuzzy_qr'; use re 'eval'; $r = qr/((?:@{[fuzzy_qr('APT')]}\s)?\w+\d)$/; $apt = ( "5678 DELAWARE AVENUE AOT 123" =~ $r )[0]; print "\$apt=$apt\n";
package Regexp::Approx;
use strict;
use base 'Exporter';
use Storable;
use String::Approx;
use POSIX ':sys_wait_h';
use vars qw($VERSION @ISA @EXPORT_OK
        $SERVER_PID
        $NLEN
        $READ_BUFFER
        %IMPORTED_FUNCTIONS
        $READER_BITS
        $READER $WRITER);
{
    no strict 'refs';
    %IMPORTED_FUNCTIONS =
    map +( $_, *{"String::Approx::$_"}{'CODE'} ),
    _imported_functions();
}
@EXPORT_OK = (qw(qrmatch),
          map { substr($_,0,1,'r'); $_ }
          keys %IMPORTED_FUNCTIONS);
$VERSION = '0.01';

INIT {
    $SIG{'CHLD'} = \&_REAPER;
    $SIG{'PIPE'} = \&_SIBLING_DIED;
    _init_module();
}

END {
    _cleanup();
}

sub qrmatch {
    my $match = shift;
    my $match_len = length $match;
    $match = quotemeta $match;
    
    my $storable_decl = '';
    my $storable_obj = shift || '';
    if ( $storable_obj ) {
    $storable_decl = "q(StorableIx=2),";
    $storable_obj = Storable::freeze( $storable_obj );
    $storable_obj =~ s(([^\w %=]))(sprintf "\\x{%x}", ord $1)ge;
    $storable_obj = "qq($storable_obj),";
    }

    my $long_len = $match_len;#1+int(1.1 * $match_len);
    my $match_code =
    qq[scalar Regexp::Approx::rmatch(
               $storable_decl q($match), $storable_obj
           substr( \$_,
                       pos(),
                       (length() - pos() < $long_len)
                           ? length() - pos()
                           : $long_len ) )];

    # Seek pos() forward $match_len ticks
    my $stand_in = '(?s:'. (q[.] x $match_len) . ')';

    return qq[(?x:(?-x:\Q$match\E) # Attempt an exact match first
                  |
                  # Match approximately
              (?(?{ $match_code })
            $stand_in
            |
                    # No match
            (?!)
                  ) 
              )];
}

sub _enter_server {
    while ( 1 ) {
    my ($context,$func,@msg) = @{_read_msg()};

    if ( $context eq 'exit' or
         $func eq 'exit' ) {
        exit;
    } elsif ( $context and $IMPORTED_FUNCTIONS{$func} ) {
        my @resp;
        if ( $context eq 'list' ) {    
        @resp = &{$IMPORTED_FUNCTIONS{$func}}( @msg );
        } else {
        $resp[0] = &{$IMPORTED_FUNCTIONS{$func}}( @msg );
        }
        
        _send_msg( \@resp );
    } else {
        _send_msg( ["Command not recognized"] );
    }
    }
    # NOT REACHED
}

sub _send_msg {
    # It is *NOT* safe to invoke the re-engine now.
    my $msg =
    join( '',
          map unpack( 'H*', pack( 'N/a*', $_ ) ),
          @{$_[0]} )
        . "\n";
    return syswrite WRITER, $msg;
}

sub _read_msg {
    # It is *NOT* safe to invoke the re-engine now.

    my $msg_len;
    do {
    # Wait until there's some input
    my $bits = $READER_BITS;
    select( $READER_BITS, undef, undef, undef );
    
    sysread READER, $READ_BUFFER, 8192, length $READ_BUFFER;
    $msg_len = rindex $READ_BUFFER, "\n";
    } until ( $msg_len != -1 );

    my $msg = substr( $READ_BUFFER,
                      0,
                        1 + $msg_len,
                  '' );
    local $/ = "\n";
    chomp $msg;

    my @msg = unpack '(N/a*)*', pack 'H*', $msg;
    if ( $msg[0] and
     substr($msg[0],0,11) eq 'StorableIx=' ) {
        # This is unsafe to execute on the client - only clits send
        # storable data so this is only ever processed on the server.
        my $ix = substr(shift(@msg),11);
        $msg[$ix] = Storable::thaw( $msg[$ix] );
    }

    return \@msg;
}

sub _imported_functions {
    no strict 'refs';
    return
    grep *{"String::Approx::$_"}{'CODE'},
    grep /^a/,
    keys %String::Approx::;
}

sub _init_module {
    for my $func ( keys %IMPORTED_FUNCTIONS ) {
    my $this = $func;
    $this =~ s(^a)(r);
    eval qq[sub $this {
        # It is *NOT* safe to invoke the re-engine now
        my \@msg = \@_;
        
        my \$StorableIx;
        if ( substr( \$msg[0], 0, 11 ) eq 'StorableIx=' ) {
        \$StorableIx = 1+substr shift(\@msg), 11;
        }
        
        unshift \@msg, q($func);
        unshift \@msg, wantarray
        ? 'list'
        : 'scalar';
        
        if ( defined \$StorableIx ) {
        unshift \@msg, "StorableIx=\$StorableIx";
        }

        _send_msg( \\\@msg );
        my \$msg = _read_msg();
        return wantarray ? \@\$msg : \$msg->[-1];
    }; 1] || die $@;
    }

    $NLEN = length pack 'N', 0;

    # Most of this code was cribbed right from perlipc.
    pipe( PARENT_RDR, CHILD_WTR )
    or die "pipe() failed: $!";
    pipe ( CHILD_RDR, PARENT_WTR )
    or die "pipe() failed: $!";

    AUTO_FLUSH: {
    my $orig_fh = select CHILD_WTR;
    $| = 1;
    select PARENT_WTR;
    $| = 1;
    select $orig_fh;
    }
    
    $READ_BUFFER = '';
    
    # perlipc safe fork. See _REAPER()
    {
    my $sleep_count = 0;
    do {
        $SERVER_PID = fork;
        unless ( defined $SERVER_PID ) {
        warn "Cannot fork: $!";
        die "Bailing out: $!" if $sleep_count++ > 6;
        sleep 10;
        }
    } until defined $SERVER_PID;
    }
    
    # Henceforth report the pid with the error message.

    if ( $SERVER_PID ) {
    close PARENT_RDR
        or warn "Parent $$ couldn't close PARENT_RDR: $!";
    close PARENT_WTR
        or warn "Parent Couldn't close PARENT_WTR: $!";
    *WRITER = *CHILD_WTR;
    *READER = *CHILD_RDR;
    $READER_BITS = '';
    vec( $READER_BITS, fileno( READER ), 1 ) = 1;
    
    # Leave the module's initializer
    return 1;
    } else {
    die "Cannot fork: $!" unless defined $SERVER_PID;
    close CHILD_RDR
        or warn "Child couldn't close CHILD_RDR: $!";
    close CHILD_WTR
        or warn "Child couldn't close CHILD_WTR: $!";
    *WRITER = *PARENT_WTR;
    *READER = *PARENT_RDR;
    $READER_BITS = '';
    vec( $READER_BITS, fileno( READER ), 1 ) = 1;
    
    goto \&_enter_server();
    }
    # NOT REACHED
}

sub _cleanup {
    if ( $SERVER_PID ) {
    _send_msg( [qw(void exit)] );
    }
    close WRITER
    or warn "$$ Couldn't close WRITER: $!";
    close READER
    or warn "$$ Couldn't close READER: $!";
    return 1;
}

sub _SIBLING_DIED {
    if ( $SERVER_PID ) {
    die "Parent $$ noticed that child $SERVER_PID unexpectedly died";
    } else {
    die "Child $$ noticed that its parent(huh?) died";
    }
}

sub _REAPER {
    my $child;
    my $waitedpid;
    while (($waitedpid = waitpid( -1, WNOHANG)) > 0) {
    if ( $? ) {
        my $exit = $? >> 8;
        my $sig  = $? & 127;
        my $core = $? & 128;
        warn "Reaped $waitedpid with exit $exit on signal $sig."
        . $core ? ' Core dumped!' : '';
    }
    }
    $SIG{'CHLD'} = \&_REAPER;
}

1;

__END__

=head1 NAME

Regexp::Approx - Use fuzzy regular expressions

=head1 SYNOPSIS

  use Regexp::Approx 'qrmatch';
  use re 'eval';

  @dict = glob "/usr/share/dict/*";
  @ARGV = $dict[ rand @dict ];
  my $fuzzy_part = qrmatch( 'teric',
                                [ 'i 10%', 'D0', 'I0' ] );
  $\ = "\n";
  while (<>) {
      chomp;
      s(($fuzzy_part))[[$1]]go and print;
  }

=head1 ABSTRACT

This module allows you to use any of String::Approx's functions from w
+ithin
a regular expression. It also provides a convenience function `qrmatch
+`
which provides a regex fragment that works like an inline 
`scalar amatch( $pattern, [ optional modifier ], $test )` expression.

=head1 DESCRIPTION

Perl's regular expression engine is not reentrant so using the re-engi
+ne
while it is already operating is a big no-no. This module skirts that
limitation by forking perl and running all the String::Approx function
+s in a
separate process.

Through unseemly peeking into String::Approx this module wraps all of 
+the
'a...' functions except they start with 'r' over here. So instead of
using 'amatch' you'd use 'rmatch'.

=over 4

=item qrmatch( "pattern", [ modifiers ] )

This function returns a regular expression fragment designed to supply
+ the
first two arguments of the String::Approx::amatch function. The patter
+n is
required, the modifiers are optional. See L<String::Approx> for detail
+s on
how to construct your pattern and what options are available as modifi
+ers.

=back

=head2 EXPORT

The qrmatch function is optionally exported.

=head1 SEE ALSO

L<String::Approx>

=head1 AUTHOR

Josh Jore, E<lt>jjore@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2003 by Josh Jore

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut
Replies are listed 'Best First'.
Re: Regexp::Approx - Use fuzzy regular expressions
by Anonymous Monk on Nov 12, 2003 at 07:37 UTC

    It seems that your "fuzzy" regex isn't much different from just using dot and the length of the target. That is:

    #!/usr/bin/perl -w use strict; use lib '.'; use Regexp::Approx 'fuzzy_qr'; use re 'eval'; while(<DATA>) { chomp; print "Using '$_' as fuzzy component:\n"; my $fuzzy_part = fuzzy_qr( $_ ); my $rx = qr/((?:$fuzzy_part\s)?\w+\d)$/; if ( "5678 DELAWARE AVENUE AOT 123" =~ $rx ) { print "\tRX Match: $1\n"; } my $really_fuzzy = '.{' . length($_) . '}'; my $rfx = qr/((?:$really_fuzzy\s)?\w+\d)$/; if ( "5678 DELAWARE AVENUE AOT 123" =~ $rfx ) { print "\tRFX Match: $1\n"; } } __END__ APT XXX ^%@#!( fuzzy matches anything? x # output: Using 'APT' as fuzzy component: RX Match: AOT 123 RFX Match: AOT 123 Using 'XXX' as fuzzy component: RX Match: AOT 123 RFX Match: AOT 123 Using '^%@#!(' as fuzzy component: RX Match: UE AOT 123 RFX Match: UE AOT 123 Using 'fuzzy matches anything?' as fuzzy component: RX Match: 678 DELAWARE AVENUE AOT 123 RFX Match: 678 DELAWARE AVENUE AOT 123 Using 'x' as fuzzy component: RX Match: T 123 RFX Match: T 123

      Drat. I musta screwed up the failure case while working on the send/recv functions. It did work correctly before. Fixes to come in an hour or so.

      Got it. The code was testing the return value of print() instead of the actual match response. When I added the general function wrapper I forgot to put a `return _read_msg()` in. (Again) it works correctly for me using your sample script.

        There seems to be remaining problems, for example:

        #!/usr/bin/perl -w use strict; use lib '.'; use Regexp::Approx 'fuzzy_qr'; use re 'eval'; while(<DATA>) { chomp; print "Using '$_' as fuzzy component:\n"; my $fuzzy_part = fuzzy_qr( $_ ); my $rx = qr/($fuzzy_part)/; if ( "5678 DELAWARE AVENUE AOT 123" =~ /$rx/ ) { print "\tRX Match: $1\n"; } } __END__ APT A XXX ^%@#!( fuzzy matches anything? x #### output: Using 'APT' as fuzzy component: RX Match: AO Using 'A' as fuzzy component: RX Match: 5 Using 'XXX' as fuzzy component: amatch: $_ is undefined: what are you matching? Compilation failed in require at xxx.pl line 4, <GEN0> line 51. BEGIN failed--compilation aborted at xxx.pl line 4, <GEN0> line 51. Using '^%@#!(' as fuzzy component: Broken pipe ### platform: $ perl -v This is perl, v5.8.0 built for i686-linux

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://306474]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others goofing around in the Monastery: (5)
As of 2024-04-18 18:07 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found