Re^2: Benchmarking "Are all these characters in this sentence?"

Well, to be really thorough, we should also have some UTF-8 string tests. But I haven't done anything with UTF-8 in Perl, so I'll leave writing those cases to someone else.

I've added a few sentences of roughly 3k characters and a few search sets of roughly 3k characters. Running the long cases moves the results further in favour of Tanktalus_AllIndex:

Short sentence and search set cases

tallulah_OriginalPost            2054/s
Tanktalus_AllRegex               2511/s
Tanktalus_AllRegex_Study         2522/s
moritz_BuildRegex_WithStudy      2595/s
moritz_BuildRegex                2715/s
varian_hash                      2983/s
RMGir_slice                      4035/s
Tanktalus_AllIndex               8219/s
RMGir_index                     12107/s


Long sentence and Short search set cases

varian_hash                      97.2/s
RMGir_slice                       115/s
moritz_BuildRegex_WithStudy      3172/s
tallulah_OriginalPost            3230/s
Tanktalus_AllRegex_Study         3319/s
Tanktalus_AllRegex               4054/s
moritz_BuildRegex                4250/s
Tanktalus_AllIndex              13032/s
RMGir_index                     17612/s

Short sentence and Long search set cases
moritz_BuildRegex_WithStudy      54.1/s
moritz_BuildRegex                54.6/s
tallulah_OriginalPost            63.6/s
Tanktalus_AllRegex               86.9/s
Tanktalus_AllRegex_Study         87.1/s
varian_hash                       161/s
RMGir_index                       285/s
RMGir_slice                       319/s
Tanktalus_AllIndex                320/s

Long sentence and Long search set cases
moritz_BuildRegex_WithStudy      54.1/s
moritz_BuildRegex                54.6/s
tallulah_OriginalPost            63.6/s
varian_hash                      86.9/s
Tanktalus_AllRegex_Study         87.7/s
Tanktalus_AllRegex               87.7/s
RMGir_slice                       135/s
RMGir_index                       250/s
Tanktalus_AllIndex                319/s

Here's the benchmark code with the added data points:

#!/usr/bin/perl -w

use strict;
use List::MoreUtils qw(all);
use Benchmark qw(cmpthese);

# Building blocks shared by the test tables below.
my $bench_pangram      = "The quick brown fox jumps over the lazy dog";
my $bench_long_pangram = $bench_pangram x 100;
my $bench_lower        = "abcdefghijklmnopqrstuvwxyz";
my $bench_lower_T      = "abcdefghijklmnopqrstuvwxyzT";
my $bench_lower_TU     = "abcdefghijklmnopqrstuvwxyzTU";

# Every row is [ sentence, wantedChars, expected result ].

# Short sentence, short wantedChars
my @shortTestCases = (
    [ "abxcd zwe rrv", "xxv",           1 ],
    [ "abxcd zwe rrv", "xxvq",          0 ],
    [ "abxcd zwe rrv", "",              1 ],
    [ $bench_pangram,  $bench_lower,    1 ],
    [ $bench_pangram,  $bench_lower_T,  1 ],
    [ $bench_pangram,  $bench_lower_TU, 0 ],
    [ $bench_pangram,  "a",             1 ],
    [ $bench_pangram,  "",              1 ],
);

# Long sentence, short wantedChars
my @longShortTestCases = (
    [ $bench_long_pangram, "",              1 ],
    [ $bench_long_pangram, "a",             1 ],
    [ $bench_long_pangram, $bench_lower_T,  1 ],
    [ $bench_long_pangram, $bench_lower_TU, 0 ],
);

# Short sentence, long wantedChars
my @shortLongTestCases = (
    [ $bench_pangram, $bench_lower_T  x 100, 1 ],
    [ $bench_pangram, $bench_lower_TU x 100, 0 ],
);

# Long sentence, long wantedChars
my @longLongTestCases = (
    [ $bench_long_pangram, $bench_lower_T  x 100, 1 ],
    [ $bench_long_pangram, $bench_lower_TU x 100, 0 ],
);

# Run one candidate implementation over a table of test cases and verify
# every answer, so that a benchmark run also acts as a correctness check.
#
# Arguments:
#   $testFn    - code ref, called as $testFn->($sentence, $wantedLetters)
#   $testName  - label used in the failure message
#   $testCases - array ref of [ sentence, wantedLetters, expectedResult ]
#
# Dies on the first case whose result disagrees with the expected one.
# Results are compared by truthiness rather than numerically, so a
# candidate may return any true/false value without being rejected.
sub benchmark_routine
{
    my ($testFn, $testName, $testCases)=@_;

    foreach my $case (@$testCases) {
        my ($sentence, $wantedLetters, $expectedResult) = @$case;

        # Scalar assignment forces scalar context on the candidate.
        my $got = $testFn->($sentence, $wantedLetters);

        die "$testName test failed ($sentence, $wantedLetters)"
          if !$got != !$expectedResult;
    }

    return;
}


# [id://707122]
# The original poster's approach: test the sentence against one small
# regex per wanted character and give up at the first miss.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   true when every character of $wantedLetters occurs in
#            $sentence (an empty $wantedLetters is trivially true),
#            false otherwise.
sub tallulah_OriginalPost
{
    my ($sentence, $wantedLetters)=@_;

    for my $letter (split //, $wantedLetters) {
        # \Q..\E makes the character literal; without it a wanted "."
        # would match anything and a wanted "(" would be a syntax error.
        return !1 if $sentence !~ /\Q$letter\E/;
    }

    return !0;
}

# [id://707123]
# Build a single regex made of one look-ahead per wanted character,
# anchored at the start of the string, and match the sentence once.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   1 when every character of $wantedLetters occurs in
#            $sentence, 0 otherwise.  An empty $wantedLetters yields 1.
sub moritz_BuildRegex
{
    my ($sentence, $wantedLetters)=@_;

    # \A(?=.*?c1)(?=.*?c2)... - each look-ahead scans from the start.
    my $re = '\A';
    $re .= '(?=.*?' . quotemeta($_) . ')' for split m//, $wantedLetters;

    # /s lets "." cross newlines; without it a character that only
    # appears after the first line of $sentence is never found.
    return $sentence =~ m/$re/s ? 1 : 0;
}

# [id://707123]
# Same look-ahead regex as moritz_BuildRegex, but calls study() on the
# sentence before matching so the two variants can be compared.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   1 when every character of $wantedLetters occurs in
#            $sentence, 0 otherwise.  An empty $wantedLetters yields 1.
sub moritz_BuildRegex_WithStudy
{
    my ($sentence, $wantedLetters)=@_;

    # \A(?=.*?c1)(?=.*?c2)... - each look-ahead scans from the start.
    my $re = '\A';
    $re .= '(?=.*?' . quotemeta($_) . ')' for split m//, $wantedLetters;

    # NOTE: perlfunc documents study as a no-op from Perl 5.16 on, so on
    # modern perls this variant only measures the cost of the call.
    study $sentence;

    # /s lets "." cross newlines; without it a character that only
    # appears after the first line of $sentence is never found.
    return $sentence =~ m/$re/s ? 1 : 0;
}

# [id://707124]
# Count how many of the wanted characters index() can locate in the
# sentence and compare that count with the number of wanted characters.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   true when every character of $wantedLetters occurs in
#            $sentence (repeated wanted characters are each counted),
#            false otherwise.
sub RMGir_index {
    my ($sentence, $wantedLetters)=@_;

    # index() returns -1 on a miss.  The hit test is against the literal
    # 0 rather than the deprecated $[ variable, which perlvar documents
    # as always 0 on current perls.
    # grep in scalar context yields the number of matching characters.
    my $foundLetters = grep { index($sentence, $_) >= 0 }
                       split //, $wantedLetters;

    return length($wantedLetters)==$foundLetters;
}

# [id://707222]
# Check each wanted character with its own one-character regex, using
# all() (imported at the top of the file) to stop at the first miss.
#
# Arguments: ($sentence, $letters)
# Returns:   true when every character of $letters occurs in $sentence,
#            false otherwise.  An empty $letters yields 1.
sub Tanktalus_AllRegex {
    my ($sentence, $letters) = @_;

    # Nothing wanted: trivially satisfied, and all() never sees an
    # empty list.
    return 1 unless length($letters);

    # \Q..\E keeps regex metacharacters literal; a bare "+" or "(" used
    # as the pattern would die, and "." would match any character.
    return all { $sentence =~ /\Q$_\E/ } split //, $letters;
}

# [id://707222]
# Same per-character regex check as Tanktalus_AllRegex, but calls
# study() on the sentence first so the two variants can be compared.
#
# Arguments: ($sentence, $letters)
# Returns:   true when every character of $letters occurs in $sentence,
#            false otherwise.  An empty $letters yields 1.
sub Tanktalus_AllRegex_Study {
    my ($sentence, $letters) = @_;

    # Nothing wanted: trivially satisfied, and all() never sees an
    # empty list.
    return 1 unless length($letters);

    # NOTE: perlfunc documents study as a no-op from Perl 5.16 on, so on
    # modern perls this variant only measures the cost of the call.
    study $sentence;

    # \Q..\E keeps regex metacharacters literal; a bare "+" or "(" used
    # as the pattern would die, and "." would match any character.
    return all { $sentence =~ /\Q$_\E/ } split //, $letters;
}

# [id://707222]
# Check each wanted character with index(), using all() (imported at
# the top of the file) to stop at the first character that is missing.
#
# Arguments: ($sentence, $letters)
# Returns:   true when every character of $letters occurs in $sentence,
#            false otherwise.  An empty $letters yields 1.
sub Tanktalus_AllIndex {
    my ($sentence, $letters) = @_;

    # Nothing wanted: trivially satisfied, and all() never sees an
    # empty list.
    return 1 unless length($letters);

    # index() returns -1 on a miss.  The hit test is against the literal
    # 0 rather than the deprecated $[ variable, which perlvar documents
    # as always 0 on current perls.
    return all { index($sentence, $_) >= 0 } split //, $letters;
}

# JavaFan's looks about equivalent to OP approach

# Doesn't have same repeated letter semantics specified in
# OP post.
# [id://707176]
# Scan the sentence from left to right, striking one wanted character
# off the list each time any of the remaining ones is met.
#
# The match uses /g in scalar context, so each search resumes where the
# previous one stopped.  Every wanted character (repeats included) must
# therefore be matched by its own, later position in the sentence.  This
# differs from the other candidates, which is why this routine is left
# out of the benchmark runs.
#
# Arguments: ($sentence, $wanted)
# Returns:   1 when the whole wanted list could be struck off, else 0.
sub oshalla_scan {
    my ($sentence, $wanted) = @_ ;

    while (length($wanted)) {
        # \Q..\E keeps the wanted characters literal inside the class;
        # a raw "^" would negate it and a raw "\" would leave it
        # unterminated.
        return 0 if ($sentence !~ m/([\Q$wanted\E])/g) ;

        # Copy the capture before the next regex operation, then remove
        # exactly one occurrence of it from the wanted list.
        my $found = $1;
        $wanted =~ s/\Q$found\E// ;
    }

    return 1;
}

# [id://707231]
# Hash-based check: record every wanted character as a key, drop the key
# of each character met in the sentence, and see whether any key is left.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   1 when no wanted character is left over, 0 otherwise.
sub varian_hash {
    my ($sentence, $wantedLetters)=@_;

    my %missing;
    $missing{$_} = 1 for split //, $wantedLetters;
    delete $missing{$_} for split //, $sentence;

    return scalar(keys %missing) ? 0 : 1;
}

# [id://707314]
# Hash-slice check: create one key per wanted character with a slice
# assignment, delete the keys of every sentence character with a slice
# delete, and see whether any key is left.
#
# Arguments: ($sentence, $wantedLetters)
# Returns:   1 when no wanted character is left over, 0 otherwise.
sub RMGir_slice {
    my ($sentence, $wantedLetters)=@_;

    my @wanted = split //, $wantedLetters;
    my @seen   = split //, $sentence;

    my %outstanding;
    @outstanding{@wanted} = ();
    delete @outstanding{@seen};

    return 0 if keys %outstanding;
    return 1;
}

# Implementations under comparison, keyed by the label shown in the
# Benchmark report.
# oshalla_scan is left out on purpose: it doesn't have the same
# repeated-letter semantics specified in the OP post, so it would fail
# the checks performed by benchmark_routine.
my %implementations = (
    tallulah_OriginalPost       => \&tallulah_OriginalPost,
    moritz_BuildRegex           => \&moritz_BuildRegex,
    moritz_BuildRegex_WithStudy => \&moritz_BuildRegex_WithStudy,
    RMGir_index                 => \&RMGir_index,
    Tanktalus_AllRegex          => \&Tanktalus_AllRegex,
    Tanktalus_AllRegex_Study    => \&Tanktalus_AllRegex_Study,
    Tanktalus_AllIndex          => \&Tanktalus_AllIndex,
    varian_hash                 => \&varian_hash,
    RMGir_slice                 => \&RMGir_slice,
);

# Each suite is [ heading to print, test-case table to run ].
my @suites = (
    [ "Short cases\n",     \@shortTestCases     ],
    [ "LongShort cases\n", \@longShortTestCases ],
    [ "ShortLong cases\n", \@shortLongTestCases ],
    [ "LongLong cases\n",  \@longLongTestCases  ],
);

foreach my $suite (@suites) {
    my ($heading, $testsRef) = @$suite;
    print $heading;

    # One closure per implementation; each runs (and checks) the whole
    # table for the current suite.
    my %timed;
    foreach my $label (sort keys %implementations) {
        my $fn = $implementations{$label};
        $timed{$label} = sub { benchmark_routine($fn, $label, $testsRef) };
    }

    # A negative count asks Benchmark for at least that many CPU seconds
    # per entry instead of a fixed number of iterations.
    cmpthese(-1, \%timed);
}
[download]

Mike

Comment on Re^2: Benchmarking "Are all these characters in this sentence?" Download Code