Re: Need regexes with alternations in them for testing perl...

Although this might not satisfy the real-world criterion, I would mine Regex::PreSuf for gnarly alternations. Previous to your work, it was the standard for regex alternation optimizing goodness. Here is fun code from word.t:

use Regex::PreSuf;

print "1..1\n";

my $test = 1;

print STDERR "# Hang on, collecting words for the next test...\n";

my %words;

foreach my $dict (qw(/usr/dict/words /usr/share/dict/words)) {
    if (open(WORDS, $dict)) {
    while (<WORDS>) {
        chomp;
        $words{$_}++;
    }
    close(WORDS);
    }
}

my @words = keys %words;

use Benchmark;

my $MAXWORDS = 20000;

if (@words) {
    print STDERR "# NOTE THAT THIS TEST WILL TAKE SEVERAL MINUTES.\n";
    print STDERR "# And I do mean *SEVERAL* minutes.\n";
    print STDERR "# We will test all the letters from 'a' to 'z',\n";
    print STDERR "# both as the first and the last letters.\n";
    my $ok = 0;
    my @az = ("a".."z");

    # Throw away some of the words.
    @words = grep { /^[a-z]+$/ } @words;

    while (@words > $MAXWORDS) {
    splice @words, rand(@words), rand(100);
    }

    print STDERR "# Using ", scalar @words, " words.\n";

    my $N0 = 2 * @words;
    my $N1;    
    my $c;
    my @a;
    my @c;
    my $T0 = time();
 
    # I'm trying to get 0 elapsed time to initialize some timesum coun
+ters here.
    # Is there a better way?
    my $t1=new Benchmark;
    my $t2=$t1;

    # Initialized to 0, updated by each run of doit.
    my $naiveCreationTotal=timediff($t1,$t2);
    my $naiveExecutionTotal=timediff($t1,$t2);
    my $presufCreationTotal=timediff($t1,$t2);
    my $presufExecutionTotal=timediff($t1,$t2);

    sub doit {
    my ($t0, $t1);
    $t0 = new Benchmark;
    my $b  = join("|", @a);
    $t1 = new Benchmark;
    my $tb = timediff($t1, $t0);
        $naiveCreationTotal=Benchmark::timesum($tb,$naiveCreationTotal
+);
    print STDERR "# Naīve/create:   ", timestr($tb), "\n";
    print STDERR "# Naīve/execute:  ";
    $t0 = new Benchmark;
    my @b = grep { /^(?:$b)$/ } @words;
    $t1 = new Benchmark;
        $tb=timediff($t1,$t0);
        $naiveExecutionTotal=Benchmark::timesum($tb,$naiveExecutionTot
+al);
        print STDERR timestr($tb), "\n";
    $t0 = new Benchmark;
    my $c  = presuf(@a);
    $t1 = new Benchmark;
    my $tc = timediff($t1, $t0);
        $presufCreationTotal=Benchmark::timesum($tc,$presufCreationTot
+al);
    print STDERR "# PreSuf/create:  ", timestr($tc), "\n";
    print STDERR "# PreSuf/execute: ";
    $t0 = new Benchmark;
    @c = grep { /^(?:$c)$/ } @words;
    $t1 = new Benchmark;
        $tc = timediff($t1, $t0);
        $presufExecutionTotal=Benchmark::timesum($tc,$presufExecutionT
+otal);
        print STDERR timestr($tc), "\n";

    print STDERR "# Aggregate times so far:\n";
    print STDERR "# Naīve/create:   ",timestr($naiveCreationTotal),"\n
+";
    print STDERR "# Naīve/execute:  ",timestr($naiveExecutionTotal),"\
+n";
    print STDERR "# Presuf/create:  ",timestr($presufCreationTotal),"\
+n";
    print STDERR "# PreSuf/execute: ",timestr($presufExecutionTotal),"
+\n";
    }

    sub checkit {
    if (@c == @a && join("\0", @a) eq join("\0", @c)) {
        $ok++;
    } else {
        print STDERR "# PreSuf FAILED!\n";
        my %a; @a{@a} = ();
        my %c; @c{@c} = ();
        my %a_c = %a; delete @a_c{keys %c};
        my %c_a = %c; delete @c_a{keys %a};
        if (keys %a_c) {
        print STDERR "# MISSED:\n";
        foreach (sort keys %a_c) {
            print STDERR "# $_\n";
        }
        }
        if (keys %c_a) {
        print STDERR "# MISTOOK:\n";
        foreach (sort keys %c_a) {
            print STDERR "# $_\n";
        }
        }
    }
    }

    sub estimateit {
    $N1 += @a;
    my $dt = time() - $T0;
    if ($N1 && $dt) {
        print STDERR "# Estimated remaining testing time: ",
                     int(($N0 - $N1)/($N1/$dt)), " seconds.\n";
    }
    }

    foreach $c (@az) {
    @a  = grep { /^$c/  } @words;
    if (@a) {
        print STDERR "# Testing ", scalar @a," words beginning with '$
+c'...\n";
        doit();
        checkit();
    } else {
        print STDERR "# No words beginning with '$c'...\n";
        $ok++; # not a typo
    }
    estimateit();

    @a  = grep { /$c$/  } @words;
    if (@a) {
        print STDERR "# Testing ", scalar @a," words ending with '$c'.
+..\n";
        doit();
        checkit();
    } else{
        print STDERR "# No words ending with '$c'...\n";
        $ok++; # not a typo
    }
    estimateit();
    }

    print STDERR "# Aggregate times total:\n";
    print STDERR "# Naīve/create:   ",timestr($naiveCreationTotal),"\n
+";
    print STDERR "# Naīve/execute:  ",timestr($naiveExecutionTotal),"\
+n";
    print STDERR "# Presuf/create:  ",timestr($presufCreationTotal),"\
+n";
    print STDERR "# PreSuf/execute: ",timestr($presufExecutionTotal),"
+\n";

    print "not " unless $ok == 2 * @az;
    print "ok ", $test++, "\n";
} else {
    print "ok ", $test++, "# skipped: no words found\n";
}
[download]

For more a more real world flavor, substitute real, depunctuated tex for /usr/dict/words. This sort of test has the advantage of not only giving you mean measure of speedup, but allows you to easily compute a variance as well.

-Mark

Comment on Re: Need regexes with alternations in them for testing perl... Download Code

Replies are listed 'Best First'.
Re^2: Need regexes with alternations in them for testing perl... by demerphq (Chancellor) on Mar 12, 2005 at 16:18 UTC
Thanks. This was interesting. I ended up playing around with words.t and discovered a slight bug that isn't exposed by this test. The regex /foobar\|foo/ is equivelent to /foo(?:bar)?/ the regex /foo\|foobar/ is equivelent to the regex /foo(?:\|bar)/. Regex::PreSuf along with similar modules like Regexp::Assemble and Regexp::List will generate /foo(?:bar)?/ for both. Eg: `G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo\|foobar/" foo G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:\|bar)/" foo G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foobar\|foo/" foobar G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:bar)?/" foobar` [download] Unfortunately this makes them not entirely suitable for my needs. But it has proved interesting using them. :-) Read more... warning wide code (5 kB) --- demerphq	[reply] [d/l] [select]

Replies are listed 'Best First'.

Re^2: Need regexes with alternations in them for testing perl...
by demerphq (Chancellor) on Mar 12, 2005 at 16:18 UTC

Thanks. This was interesting. I ended up playing around with words.t and discovered a slight bug that isn't exposed by this test. The regex /foobar|foo/ is equivelent to /foo(?:bar)?/ the regex /foo|foobar/ is equivelent to the regex /foo(?:|bar)/. Regex::PreSuf along with similar modules like Regexp::Assemble and Regexp::List will generate /foo(?:bar)?/ for both. Eg:

G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo|foobar/"
foo
G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:|bar)/"
foo
G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foobar|foo/"
foobar
G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:bar)?/"
foobar
[download]

Unfortunately this makes them not entirely suitable for my needs. But it has proved interesting using them. :-)