in reply to Need regexes with alternations in them for testing perl...

Although this might not satisfy the real-world criterion, I would mine Regex::PreSuf for gnarly alternations. Previous to your work, it was the standard for regex alternation optimizing goodness. Here is fun code from word.t:
use Regex::PreSuf; print "1..1\n"; my $test = 1; print STDERR "# Hang on, collecting words for the next test...\n"; my %words; foreach my $dict (qw(/usr/dict/words /usr/share/dict/words)) { if (open(WORDS, $dict)) { while (<WORDS>) { chomp; $words{$_}++; } close(WORDS); } } my @words = keys %words; use Benchmark; my $MAXWORDS = 20000; if (@words) { print STDERR "# NOTE THAT THIS TEST WILL TAKE SEVERAL MINUTES.\n"; print STDERR "# And I do mean *SEVERAL* minutes.\n"; print STDERR "# We will test all the letters from 'a' to 'z',\n"; print STDERR "# both as the first and the last letters.\n"; my $ok = 0; my @az = ("a".."z"); # Throw away some of the words. @words = grep { /^[a-z]+$/ } @words; while (@words > $MAXWORDS) { splice @words, rand(@words), rand(100); } print STDERR "# Using ", scalar @words, " words.\n"; my $N0 = 2 * @words; my $N1; my $c; my @a; my @c; my $T0 = time(); # I'm trying to get 0 elapsed time to initialize some timesum coun +ters here. # Is there a better way? my $t1=new Benchmark; my $t2=$t1; # Initialized to 0, updated by each run of doit. my $naiveCreationTotal=timediff($t1,$t2); my $naiveExecutionTotal=timediff($t1,$t2); my $presufCreationTotal=timediff($t1,$t2); my $presufExecutionTotal=timediff($t1,$t2); sub doit { my ($t0, $t1); $t0 = new Benchmark; my $b = join("|", @a); $t1 = new Benchmark; my $tb = timediff($t1, $t0); $naiveCreationTotal=Benchmark::timesum($tb,$naiveCreationTotal +); print STDERR "# Naïve/create: ", timestr($tb), "\n"; print STDERR "# Naïve/execute: "; $t0 = new Benchmark; my @b = grep { /^(?:$b)$/ } @words; $t1 = new Benchmark; $tb=timediff($t1,$t0); $naiveExecutionTotal=Benchmark::timesum($tb,$naiveExecutionTot +al); print STDERR timestr($tb), "\n"; $t0 = new Benchmark; my $c = presuf(@a); $t1 = new Benchmark; my $tc = timediff($t1, $t0); $presufCreationTotal=Benchmark::timesum($tc,$presufCreationTot +al); print STDERR "# PreSuf/create: ", timestr($tc), "\n"; print STDERR "# PreSuf/execute: "; $t0 = new Benchmark; @c = grep { /^(?:$c)$/ } @words; $t1 = new Benchmark; $tc = timediff($t1, $t0); $presufExecutionTotal=Benchmark::timesum($tc,$presufExecutionT +otal); print STDERR timestr($tc), "\n"; print STDERR "# Aggregate times so far:\n"; print STDERR "# Naïve/create: ",timestr($naiveCreationTotal),"\n +"; print STDERR "# Naïve/execute: ",timestr($naiveExecutionTotal),"\ +n"; print STDERR "# Presuf/create: ",timestr($presufCreationTotal),"\ +n"; print STDERR "# PreSuf/execute: ",timestr($presufExecutionTotal)," +\n"; } sub checkit { if (@c == @a && join("\0", @a) eq join("\0", @c)) { $ok++; } else { print STDERR "# PreSuf FAILED!\n"; my %a; @a{@a} = (); my %c; @c{@c} = (); my %a_c = %a; delete @a_c{keys %c}; my %c_a = %c; delete @c_a{keys %a}; if (keys %a_c) { print STDERR "# MISSED:\n"; foreach (sort keys %a_c) { print STDERR "# $_\n"; } } if (keys %c_a) { print STDERR "# MISTOOK:\n"; foreach (sort keys %c_a) { print STDERR "# $_\n"; } } } } sub estimateit { $N1 += @a; my $dt = time() - $T0; if ($N1 && $dt) { print STDERR "# Estimated remaining testing time: ", int(($N0 - $N1)/($N1/$dt)), " seconds.\n"; } } foreach $c (@az) { @a = grep { /^$c/ } @words; if (@a) { print STDERR "# Testing ", scalar @a," words beginning with '$ +c'...\n"; doit(); checkit(); } else { print STDERR "# No words beginning with '$c'...\n"; $ok++; # not a typo } estimateit(); @a = grep { /$c$/ } @words; if (@a) { print STDERR "# Testing ", scalar @a," words ending with '$c'. +..\n"; doit(); checkit(); } else{ print STDERR "# No words ending with '$c'...\n"; $ok++; # not a typo } estimateit(); } print STDERR "# Aggregate times total:\n"; print STDERR "# Naïve/create: ",timestr($naiveCreationTotal),"\n +"; print STDERR "# Naïve/execute: ",timestr($naiveExecutionTotal),"\ +n"; print STDERR "# Presuf/create: ",timestr($presufCreationTotal),"\ +n"; print STDERR "# PreSuf/execute: ",timestr($presufExecutionTotal)," +\n"; print "not " unless $ok == 2 * @az; print "ok ", $test++, "\n"; } else { print "ok ", $test++, "# skipped: no words found\n"; }
For more a more real world flavor, substitute real, depunctuated tex for /usr/dict/words. This sort of test has the advantage of not only giving you mean measure of speedup, but allows you to easily compute a variance as well.

-Mark

Replies are listed 'Best First'.
Re^2: Need regexes with alternations in them for testing perl...
by demerphq (Chancellor) on Mar 12, 2005 at 16:18 UTC

    Thanks. This was interesting. I ended up playing around with words.t and discovered a slight bug that isn't exposed by this test. The regex /foobar|foo/ is equivelent to /foo(?:bar)?/ the regex /foo|foobar/ is equivelent to the regex /foo(?:|bar)/. Regex::PreSuf along with similar modules like Regexp::Assemble and Regexp::List will generate /foo(?:bar)?/ for both. Eg:

    G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo|foobar/" foo G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:|bar)/" foo G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foobar|foo/" foobar G:\cblead2\win32>perl -e "print $& if 'foobar'=~/foo(?:bar)?/" foobar

    Unfortunately this makes them not entirely suitable for my needs. But it has proved interesting using them. :-)

    ---
    demerphq