#!/usr/bin/perl
#
# Benchmark four ways of scanning input lines (the files named on the
# command line, or STDIN when none are given) for any of a fixed list of
# "problem" keywords:
#
#   NO_REGEX     - upper-case the line, then index() for each keyword
#   CODE_REGEX   - one generated sub chaining a literal /(...)/i per keyword
#   BIG_REGEX    - a single combined pattern built by ret_match_any()
#   MANY_REGEXES - one precompiled qr// per keyword, tried in order
#
# Every report is printed to /dev/null, so the reports themselves are
# discarded and only the scanning work shows up in the timings.
use strict;
use warnings;
use Benchmark;

# Keywords to look for.  Kept as a package variable so the list stays
# reachable by name from outside this file's lexical scope.
our @problem = (
    "0 OBS",
    "AT LEAST",
    "EXTRANEOUS",
    "CARTESIAN",
    "CLOSING",
    "CONVERT",
    "DIVISION BY ZERO",
    "DOES NOT EXIST",
    "DUE TO LOOPING",
    "END OF MACRO",
    "ENDING EXECUTION",
    "ERROR",
    "ERRORABEND",
    "ERRORCHECK=STRICT",
    "EXCEED",
    "HANGING",
    "HAS 0 OBSERVATIONS",
    "ILLEGAL",
    "INCOMPLETE",
    "INVALID",
    "LOST CARD",
    "MATHEMAT",
    "MERGE STATEMENT",
    "MISSING",
    "MULTIPLE",
    "NOT FOUND",
    "NOT RESOLVED",
    "OBS=0",
    "REFERENCE",
    "REPEAT",
    "SAS CAMPUS DRIVE",
    "SAS SET OPTION OBS=0",
    "SAS WENT",
    "SHIFTED",
    "STOP",
    "TOO SMALL",
    "UNBALANCED",
    "UNCLOSED",
    "UNREF",
    "UNRESOLVED",
    "WARNING",
);

# Sink for all reports.  Declared before the subs below so they can see it.
open my $sink, '>', '/dev/null' or die "Cannot open /dev/null: $!";

timethese(
    5,
    {
        NO_REGEX     => \&no_regex,
        CODE_REGEX   => \&code_regex,
        BIG_REGEX    => \&big_regex,
        MANY_REGEXES => \&many_regexes,
    }
);

close $sink or die "Cannot close /dev/null: $!";

# Scan the input without any regex: upper-case each line once and look for
# each keyword with index().  Reports the first keyword found on a line
# (in its upper-case list form) and moves on to the next line.
sub no_regex {
    local @ARGV = @ARGV;    # <> consumes @ARGV; every pass needs its own copy

    while (<>) {
        my $upper = uc $_;
        for my $keyword (@problem) {
            next if index($upper, $keyword) < 0;
            print {$sink} "line $.: problem: $keyword\n$_\n";
            last;
        }
    }
    return;
}

# Scan the input with the single pattern returned by ret_match_any() and
# report the text captured in $1 for every matching line.
#
# NOTE(review): ret_match_any() is only a placeholder in this file, so
# $match is undefined here until the real implementation is pasted in;
# BIG_REGEX timings from the file as-is are not meaningful.
sub big_regex {
    local @ARGV = @ARGV;    # <> consumes @ARGV; every pass needs its own copy

    my $match = ret_match_any(@problem);
    while (<>) {
        if ($_ =~ $match) {
            print {$sink} "line $.: problem: $1\n$_\n";
        }
    }
    return;
}

# Placeholder: the body was elided by the author.
# NOTE(review): presumably meant to return one pattern matching any of the
# strings it is given (that is how big_regex uses the result) - supply the
# original implementation before trusting BIG_REGEX.
sub ret_match_any {
    # same as tilly's original
}

# Placeholder: the body was elided by the author; nothing in this file
# calls it directly.
sub trie_strs {
    # same as tilly's original
}

# Scan the input with one precompiled, case-insensitive, capturing pattern
# per keyword, tried in list order.  Reports the text captured by the first
# pattern that matches a line and moves on to the next line.
sub many_regexes {
    local @ARGV = @ARGV;    # <> consumes @ARGV; every pass needs its own copy

    my @patterns = map { qr/(\Q$_\E)/i } @problem;
    while (<>) {
        for my $pattern (@patterns) {
            next unless /$pattern/;
            print {$sink} "line $.: problem: $1\n$_\n";
            last;
        }
    }
    return;
}

# Scan the input with a generated sub of the form
#     sub { /(KW1)/i || /(KW2)/i || ... and return $1}
# so every keyword is a literal regex compiled once with the sub.  The sub
# matches against $_ and returns the captured text, or a false value when
# nothing matches.
sub code_regex {
    local @ARGV = @ARGV;    # <> consumes @ARGV; every pass needs its own copy

    my $tests = join ' || ', map { '/(' . quotemeta($_) . ')/i' } @problem;
    my $code  = "sub { $tests and return \$1}";

    # String eval is acceptable here only because the source text is built
    # from the fixed, quotemeta-escaped keyword list above; never feed it
    # external input.
    my $match = eval $code;
    die $@ unless ref($match) eq 'CODE';

    while (<>) {
        if (my $hit = $match->()) {
            print {$sink} "line $.: problem: $hit\n$_\n";
        }
    }
    return;
}

__END__

5.6:
chh@scallop test> perl matchtest sample.txt
Benchmark: timing 5 iterations of BIG_REGEX, CODE_REGEX, MANY_REGEXES, NO_REGEX...
   BIG_REGEX: 90 wallclock secs (89.58 usr +  0.28 sys = 89.86 CPU) @  0.06/s (n=5)
  CODE_REGEX: 53 wallclock secs (53.13 usr +  0.36 sys = 53.49 CPU) @  0.09/s (n=5)
MANY_REGEXES: 60 wallclock secs (59.27 usr +  0.28 sys = 59.55 CPU) @  0.08/s (n=5)
    NO_REGEX: 44 wallclock secs (43.33 usr +  0.30 sys = 43.63 CPU) @  0.11/s (n=5)

5.005_3:
Benchmark: timing 5 iterations of BIG_REGEX, CODE_REGEX, MANY_REGEXES, NO_REGEX...
   BIG_REGEX: 79 wallclock secs (77.08 usr +  0.61 sys = 77.69 CPU)
  CODE_REGEX: 357 wallclock secs (354.97 usr +  0.64 sys = 355.61 CPU)
MANY_REGEXES: 363 wallclock secs (361.99 usr +  0.73 sys = 362.72 CPU)
    NO_REGEX: 43 wallclock secs (42.84 usr +  0.19 sys = 43.03 CPU)

The 10MB test file was generated thusly:

    my @chars = map chr($_), 32..127;
    open SAMPLE, ">sample.txt" or die $!;
    while (-s SAMPLE < 1024*1024*10) {
        my $line = join '', map { $chars[rand @chars] } 1..100;
        substr $line, rand(100), 0, $problem[rand @problem];
        print SAMPLE "$line\n";
    }
    close SAMPLE;