in reply to More efficient way to truncate long strings of the same character
Benchmarks are fun and very often not worth writing. In this case it turns out the OP's "slow" regex ain't so bad:
The long: Rate ikegami2A mrm_l ikegami1 ikegami3 ikegami2x ikegami2 +JavaFan kvale ccn mrm_s orig GF ikegami2A 693/s -- -60% -66% -69% -76% -76% + -84% -84% -85% -89% -90% -90% mrm_l 1717/s 148% -- -15% -24% -40% -42% + -61% -62% -63% -72% -74% -75% ikegami1 2023/s 192% 18% -- -10% -29% -31% + -54% -55% -56% -67% -70% -71% ikegami3 2256/s 226% 31% 12% -- -21% -23% + -49% -49% -51% -63% -66% -67% ikegami2x 2856/s 312% 66% 41% 27% -- -3% + -36% -36% -38% -53% -57% -59% ikegami2 2940/s 324% 71% 45% 30% 3% -- + -34% -34% -36% -52% -56% -58% JavaFan 4443/s 542% 159% 120% 97% 56% 51% + -- -1% -3% -27% -34% -36% kvale 4467/s 545% 160% 121% 98% 56% 52% + 1% -- -2% -27% -33% -35% ccn 4581/s 562% 167% 127% 103% 60% 56% + 3% 3% -- -25% -32% -34% mrm_s 6087/s 779% 254% 201% 170% 113% 107% + 37% 36% 33% -- -9% -12% orig 6689/s 866% 290% 231% 196% 134% 128% + 51% 50% 46% 10% -- -3% GF 6919/s 899% 303% 242% 207% 142% 135% + 56% 55% 51% 14% 3% -- the short: Rate ikegami2x ikegami2A ikegami1 ikegami3 JavaFan mrm_l + kvale ccn ikegami2 mrm_s GF orig ikegami2x 54274/s -- -26% -64% -64% -71% -71% + -71% -75% -78% -78% -78% -84% ikegami2A 73135/s 35% -- -52% -52% -61% -61% + -61% -66% -70% -70% -71% -79% ikegami1 151690/s 179% 107% -- -0% -18% -19% + -19% -30% -38% -38% -39% -56% ikegami3 151690/s 179% 107% 0% -- -18% -19% + -19% -30% -38% -38% -39% -56% JavaFan 185391/s 242% 153% 22% 22% -- -1% + -2% -15% -24% -25% -26% -46% mrm_l 188182/s 247% 157% 24% 24% 2% -- + -0% -13% -22% -24% -25% -46% kvale 188359/s 247% 158% 24% 24% 2% 0% + -- -13% -22% -24% -25% -46% ccn 217080/s 300% 197% 43% 43% 17% 15% + 15% -- -11% -12% -13% -37% ikegami2 242794/s 347% 232% 60% 60% 31% 29% + 29% 12% -- -1% -3% -30% mrm_s 246469/s 354% 237% 62% 62% 33% 31% + 31% 14% 2% -- -1% -29% GF 249581/s 360% 241% 65% 65% 35% 33% + 33% 15% 3% 1% -- -28% orig 346085/s 538% 373% 128% 128% 87% 84% + 84% 59% 43% 40% 39% -- and the check: orig: 333444 ccn: 333444 kvale: 333444 JavaFan: 333444 GF: 333444 ikegami1: 3334444 ikegami2: 333444 ikegami2A: 333444 ikegami2x: 333444 ikegami3: 3334444 mrm_s_const: 3334444 mrm_l_const: 3334444
use warnings; use strict; use Benchmark qw(cmpthese); my ($ikegamiRe1) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map qu +otemeta, 1 .. 9; my ($ikegamiReA) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map qu +otemeta, (1 .. 9, 'a' .. 'z', 'A' .. 'Z'); my $str = join '', map { $_ x $_ } 1 .. 9; $str = $str x 20; print "The long:\n"; cmpthese ( -1, { orig => \&orig, ccn => \&ccn, kvale => \&kvale, JavaFan => \&JavaFan, GF => \&GF, ikegami1 => \&ikegami1, ikegami2 => \&ikegami2, ikegami2A => \&ikegami2A, ikegami2x => \&ikegami2x, ikegami3 => \&ikegami3, mrm_s => \&mrm_s_const, mrm_l => \&mrm_l_const, } ); $str = substr $str, 3, 7; print "the short:\n"; cmpthese ( -1, { orig => \&orig, ccn => \&ccn, kvale => \&kvale, JavaFan => \&JavaFan, GF => \&GF, ikegami1 => \&ikegami1, ikegami2 => \&ikegami2, ikegami2A => \&ikegami2A, ikegami2x => \&ikegami2x, ikegami3 => \&ikegami3, mrm_s => \&mrm_s_const, mrm_l => \&mrm_l_const, } ); print "and the check:\n"; for my $func (qw( orig ccn kvale JavaFan GF ikegami1 ikegami2 ikegami2A ikegami2 +x ikegami3 mrm_s_const mrm_l_const )) { my $funcRef = \&{"$func"}; &$funcRef; printf "%12s: %s\n", $func, $_; } sub orig { $_ = $str; s/(.)(?=\1\1\1)//gs; } sub ccn { $_ = $str; s/(.)\1{3,}/$1$1$1/gs; } sub kvale { $_ = $str; s/(.)\1{2,}/$1$1$1/g; } sub JavaFan { $_ = $str; s/(.)\1{2,}/$1$1$1/g; } sub GF { $_ = $str; s/((.)\2\2)\2+/$1/g; } sub ikegami1 { $_ = $str; s/$ikegamiRe1/substr($_,$-[0],3)/eg; } sub ikegami2 { $_ = $str; s/$ikegamiRe1//g; } sub ikegami2A { $_ = $str; s/$ikegamiReA//g; } sub ikegami2x { my ($re) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map quotem +eta, 1 .. 9; $_ = $str; s/$re//g; } sub ikegami3 { $_ = $str; s/($ikegamiRe1)/substr($1,0,3)/eg; } sub mrm_long { my $len = length $_[1]; ### loop prelude my $char = substr $_[1], 0, 1; my $count = 1; my $new_string = $char; my $at = 1; while ( $at < $len ) { my $old_char = $char; $char = substr $_[1], $at++, 1; if ( $char eq $old_char ) { next if $count >= $_[0]; $count++; } else { $count = 1; } $new_string .= $char; } return $new_string; } sub mrm_short { ( my $new_string = $_[1] ) =~ s/(.)\1{$_[0],}/$1 x $_[0]/eg; return $new_string; } sub mrm_s_const { ( my $new_string = $str ) =~ s/(.)\1{3,}/$1 x 3/eg; return $new_string; } sub mrm_l_const { my $len = length $str; ### loop prelude my $char = substr $str, 0, 1; my $count = 1; my $new_string = $char; my $at = 1; while ( $at < $len ) { my $old_char = $char; $char = substr $str, $at++, 1; if ( $char eq $old_char ) { next if $count >= 3; $count++; } else { $count = 1; } $new_string .= $char; } return $new_string; }
The ikegami digit versions use 1 .. 9. The 2A version uses digits plus lower and upper case letters. The 2x variant generates the expanded match regex for each iteration to represent a build per match cost.
Oh, and my regex? s/((.)\1{2})\1+/$2/g.
Update: Fixed GF regex as noted by johngg. Added the fixed length variant of mr_mischief solutions. Added a sanity check for the routines (which are interesting!) and regenerated the output (Perl 5.8.8 on XP).
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: More efficient way to truncate long strings of the same character
by johngg (Canon) on Oct 30, 2008 at 21:39 UTC | |
by mr_mischief (Monsignor) on Oct 30, 2008 at 23:37 UTC | |
|
Re^2: More efficient way to truncate long strings of the same character
by mr_mischief (Monsignor) on Oct 30, 2008 at 21:51 UTC | |
|
Re^2: More efficient way to truncate long strings of the same character
by johngg (Canon) on Oct 31, 2008 at 15:16 UTC | |
by mr_mischief (Monsignor) on Oct 31, 2008 at 21:30 UTC | |
|
Re^2: More efficient way to truncate long strings of the same character
by mr_mischief (Monsignor) on Oct 31, 2008 at 21:35 UTC | |
by GrandFather (Saint) on Oct 31, 2008 at 22:15 UTC |