use strict;
use warnings;
use Benchmark qw/ cmpthese /;
# Two ten-character inputs, one all "R" and one all "Q"; neither contains
# a digit.
my $R = 'R' x 10;
my $Q = 'Q' x 10;
# Turn on the regex engine's debugging output for the matches in this file.
use re 'debug';
# Walk the string referenced by $c from offset 0, consuming a "\d+R" token
# wherever one starts at the current position and otherwise skipping ahead
# past the next word character.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "\d+R" tokens consumed.
sub foo {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        # /g makes a successful match advance pos(); without it the loop
        # would retry the same offset forever.  /c keeps pos() in place
        # when the match fails, so the fallback below resumes from there.
        if ( $$c =~ /\G\d+R/gc ) {
            $matched++;
            next;
        }
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# Scan each input in turn, framing every run with an 80-column ruler.
my $ruler = '-' x 80 . "\n";
print $ruler;
for my $input ( \$Q, \$R ) {
    foo( $input );
    print $ruler;
}
Matching REx "\G\d+R" against "QQQQQQQQQQ"
Intuit: trying to determine minimum start position...
Did not find floating substr "R"...
Match rejected by optimizer
Matching REx "\w" against "QQQQQQQQQQ"
Matching stclass POSIXD[\w] against "QQQQQQQQQQ" (10 bytes)
0 <> <QQQQQQQQQQ> | 1:POSIXD[\w](2)
1 <Q> <QQQQQQQQQ> | 2:END(0)
Match successful!
...
Matching REx "\G\d+R" against "QQQQQQQQQQ"
Regex match can't succeed, so not even tried
Matching REx "\w" against "Q"
Matching stclass POSIXD[\w] against "Q" (1 bytes)
9 <QQQQQQQQQ> <Q> | 1:POSIXD[\w](2)
10 <QQQQQQQQQQ> <> | 2:END(0)
Match successful!
Matching REx "\G\d+R" against "QQQQQQQQQQ"
Regex match can't succeed, so not even tried
Matching REx "\w" against ""
Regex match can't succeed, so not even tried
----------------------------------------------------
--------------------------------------------------------------------------------
Matching REx "\G\d+R" against "RRRRRRRRRR"
Intuit: trying to determine minimum start position...
Found floating substr "R" at offset 1...
(multiline anchor test skipped)
looking for class: start_shift: 1 check_at: 1 rx_origin: 0 endpos: 1
This position contradicts STCLASS...
Match rejected by optimizer
Matching REx "\w" against "RRRRRRRRRR"
Matching stclass POSIXD[\w] against "RRRRRRRRRR" (10 bytes)
0 <> <RRRRRRRRRR> | 1:POSIXD[\w](2)
1 <R> <RRRRRRRRR> | 2:END(0)
Match successful!
Matching REx "\G\d+R" against "RRRRRRRRRR"
Intuit: trying to determine minimum start position...
Found floating substr "R" at offset 1...
(multiline anchor test skipped)
looking for class: start_shift: 1 check_at: 2 rx_origin: 1 endpos: 2
This position contradicts STCLASS...
Match rejected by optimizer
Matching REx "\w" against "RRRRRRRRR"
Matching stclass POSIXD[\w] against "RRRRRRRRR" (9 bytes)
1 <R> <RRRRRRRRR> | 1:POSIXD[\w](2)
2 <RR> <RRRRRRRR> | 2:END(0)
Match successful!
...
I'm not sure how you could make Perl skip scanning the string for the fixed substring R and instead try \G\d first, so that it rejects the match at that location immediately, no matter what follows.
By splitting the match into two steps, \d+ and then R, I can make both cases perform alike, within roughly 18% of the speed that \G\d+R achieves on the R string, but I'm not sure whether the split version is still correct:
use strict;
use warnings;
use Benchmark qw/ cmpthese /;
# Benchmark inputs: 42,000 copies of a single letter, with no digits at all.
my $R = 'R' x 42_000;
my $Q = 'Q' x 42_000;
# Walk the string referenced by $c from offset 0, consuming a "\d+R" token
# wherever one starts at the current position and otherwise skipping ahead
# past the next word character.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "\d+R" tokens consumed.
sub foo {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        # /g makes a successful match advance pos(); without it the loop
        # would retry the same offset forever.  /c keeps pos() in place
        # when the match fails, so the fallback below resumes from there.
        if ( $$c =~ /\G\d+R/gc ) {
            $matched++;
            next;
        }
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# Two-step variant of foo(): recognises the same "\d+R" token, but matches
# the digit run and the trailing "R" with two separate anchored patterns.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "\d+R" tokens consumed.
sub foo_twostep {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        my $start = pos $$c;
        # Both steps need /g so that the second \G is anchored *after* the
        # digits; without it both patterns test the same offset and can
        # never succeed together.  /c keeps pos() defined on failure.
        # \d (not [0-9]) keeps this equivalent to the one-step /\G\d+R/.
        if ( $$c =~ /\G\d+/gc and $$c =~ /\GR/gc ) {
            $matched++;
            next;
        }
        # Digits without a following "R": undo the partial advance so the
        # fallback steps from the same offset as the one-step version.
        pos $$c = $start;
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# Run every sub/input pairing for at least 3 CPU seconds each and print the
# comparison chart.
my %contender = (
    Q         => sub { foo( \$Q ) },
    Q_twostep => sub { foo_twostep( \$Q ) },
    R         => sub { foo( \$R ) },
    R_twostep => sub { foo_twostep( \$R ) },
);
cmpthese( -3, \%contender );
__END__
            Rate         Q R_twostep Q_twostep         R
Q         2.01/s        --      -97%      -97%      -97%
R_twostep 65.3/s     3147%        --        0%      -15%
Q_twostep 65.3/s     3147%        0%        --      -15%
R         77.1/s     3732%       18%       18%        --
Update: Using only ASCII digits ([0-9] instead of \d), I can reduce the slowdown to about 12%:
use strict;
use warnings;
use Benchmark qw/ cmpthese /;
# Benchmark inputs: 42,000 copies of a single letter, with no digits at all.
my $R = 'R' x 42_000;
my $Q = 'Q' x 42_000;
# Walk the string referenced by $c from offset 0, consuming a "\d+R" token
# wherever one starts at the current position and otherwise skipping ahead
# past the next word character.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "\d+R" tokens consumed.
sub foo {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        # /g makes a successful match advance pos(); without it the loop
        # would retry the same offset forever.  /c keeps pos() in place
        # when the match fails, so the fallback below resumes from there.
        if ( $$c =~ /\G\d+R/gc ) {
            $matched++;
            next;
        }
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# Two-step variant of foo(): recognises the same "\d+R" token, but matches
# the digit run and the trailing "R" with two separate anchored patterns.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "\d+R" tokens consumed.
sub foo_twostep {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        my $start = pos $$c;
        # Both steps need /g so that the second \G is anchored *after* the
        # digits; without it both patterns test the same offset and can
        # never succeed together.  /c keeps pos() defined on failure.
        if ( $$c =~ /\G\d+/gc and $$c =~ /\GR/gc ) {
            $matched++;
            next;
        }
        # Digits without a following "R": undo the partial advance so the
        # fallback steps from the same offset as the one-step version.
        pos $$c = $start;
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# ASCII-only two-step variant of foo(): recognises "[0-9]+R" tokens, matching
# the digit run and the trailing "R" with two separate anchored patterns.
#
# Parameters:
#   $c - reference to the scalar to scan.  Its pos() is set to 0 on entry
#        and is undefined again when the sub returns.
# Returns: the number of "[0-9]+R" tokens consumed.
sub foo_asciidigits {
    my $c       = shift;
    my $matched = 0;
    pos $$c = 0;
    while ( 1 ) {
        my $start = pos $$c;
        # Both steps need /g so that the second \G is anchored *after* the
        # digits; without it both patterns test the same offset and can
        # never succeed together.  /c keeps pos() defined on failure.
        # The "+" mirrors the \d+ of the other variants, so the only
        # difference is the ASCII-only digit class.
        if ( $$c =~ /\G[0-9]+/gc and $$c =~ /\GR/gc ) {
            $matched++;
            next;
        }
        # Digits without a following "R": undo the partial advance so the
        # fallback steps from the same offset as the one-step version.
        pos $$c = $start;
        # Skip past the next word character; stop once none is left.
        last unless $$c =~ /\w/g;
    }
    return $matched;
}
# Run every sub/input pairing for at least 3 CPU seconds each and print the
# comparison chart.
my %contender = (
    Q         => sub { foo( \$Q ) },
    Q_ascii   => sub { foo_asciidigits( \$Q ) },
    Q_twostep => sub { foo_twostep( \$Q ) },
    R         => sub { foo( \$R ) },
    R_ascii   => sub { foo_asciidigits( \$R ) },
    R_twostep => sub { foo_twostep( \$R ) },
);
cmpthese( -3, \%contender );
__END__
            Rate         Q Q_twostep R_twostep   R_ascii   Q_ascii         R
Q         1.99/s        --      -97%      -97%      -97%      -97%      -97%
Q_twostep 62.9/s     3054%        --       -1%       -7%       -8%      -17%
R_twostep 63.8/s     3101%        1%        --       -5%       -6%      -16%
R_ascii   67.5/s     3287%        7%        6%        --       -1%      -11%
Q_ascii   68.1/s     3319%        8%        7%        1%        --      -10%
R         76.0/s     3715%       21%       19%       13%       12%        --
|