I tried a method (&johngg) that used a global match to build a list of offsets and lengths of characters to discard and then used substr, processing the list in reverse order, i.e. from the end of the string back towards the front to avoid screwing up the offsets. This approach was a middling performer in the benchmarks.
I then refined it (&johngg2) to work directly rather than building a list by matching against the reversed string. This seems to be a bit faster, placing third on long strings and coming out top with short ones.
use warnings; use strict; use Benchmark qw(cmpthese); my ($ikegamiRe1) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map qu +otemeta, 1 .. 9; my ($ikegamiReA) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map qu +otemeta, (1 .. 9, 'a' .. 'z', 'A' .. 'Z'); my $str = join '', map { $_ x $_ } 1 .. 9; $str = $str x 20; print "The long:\n"; cmpthese ( -1, { orig => \&orig, ccn => \&ccn, kvale => \&kvale, johngg => \&johngg, johngg2 => \&johngg2, JavaFan => \&JavaFan, GF => \&GF, ikegami1 => \&ikegami1, ikegami2 => \&ikegami2, ikegami2A => \&ikegami2A, ikegami2x => \&ikegami2x, ikegami3 => \&ikegami3, mrm_s => \&mrm_s_const, mrm_l => \&mrm_l_const, } ); $str = substr $str, 3, 7; print "the short:\n"; cmpthese ( -1, { orig => \&orig, ccn => \&ccn, kvale => \&kvale, johngg => \&johngg, johngg2 => \&johngg2, JavaFan => \&JavaFan, GF => \&GF, ikegami1 => \&ikegami1, ikegami2 => \&ikegami2, ikegami2A => \&ikegami2A, ikegami2x => \&ikegami2x, ikegami3 => \&ikegami3, mrm_s => \&mrm_s_const, mrm_l => \&mrm_l_const, } ); print "and the check:\n"; for my $func (qw( orig ccn kvale johngg johngg2 JavaFan GF ikegami1 ikegami2 ikegami2A ikegami2x ikegami3 mrm_s_const mrm_l_const )) { my $funcRef = \&{"$func"}; &$funcRef; printf "%12s: %s\n", $func, $_; } sub orig { $_ = $str; s/(.)(?=\1\1\1)//gs; } sub ccn { $_ = $str; s/(.)\1{3,}/$1$1$1/gs; } sub kvale { $_ = $str; s/(.)\1{2,}/$1$1$1/g; } sub johngg { my $newStr = $str; my @sequences; while( $newStr =~ m{((.)\2{3,})}g ) { my $charsToLose = length( $1 ) - 3; push @sequences, [ pos( $newStr ) - $charsToLose, $charsToLose + ]; } substr $newStr, $_->[ 0 ], $_->[ 1 ], q{} for reverse @sequences; return $newStr; } sub johngg2 { my $newStr = $str; my $revStr = reverse $str; my $length = length $newStr; substr $newStr, $length - pos( $revStr ) + 3, length( $1 ) - 3, q{} while $revStr =~ m{((.)\2{3,})}g; return $newStr; } sub JavaFan { $_ = $str; s/(.)\1{2,}/$1$1$1/g; } sub GF { $_ = $str; s/((.)\2\2)\2+/$1/g; } sub ikegami1 { $_ = $str; s/$ikegamiRe1/substr($_,$-[0],3)/eg; } sub ikegami2 { $_ = $str; s/$ikegamiRe1//g; } sub ikegami2A { $_ = $str; s/$ikegamiReA//g; } sub ikegami2x { my ($re) = map qr/$_/, join '|', map "(?<=${_}{3})$_+", map quotem +eta, 1 .. 9; $_ = $str; s/$re//g; } sub ikegami3 { $_ = $str; s/($ikegamiRe1)/substr($1,0,3)/eg; } sub mrm_long { my $len = length $_[1]; ### loop prelude my $char = substr $_[1], 0, 1; my $count = 1; my $new_string = $char; my $at = 1; while ( $at < $len ) { my $old_char = $char; $char = substr $_[1], $at++, 1; if ( $char eq $old_char ) { next if $count >= $_[0]; $count++; } else { $count = 1; } $new_string .= $char; } return $new_string; } sub mrm_short { ( my $new_string = $_[1] ) =~ s/(.)\1{$_[0],}/$1 x $_[0]/eg; return $new_string; } sub mrm_s_const { ( my $new_string = $str ) =~ s/(.)\1{3,}/$1 x 3/eg; return $new_string; } sub mrm_l_const { my $len = length $str; ### loop prelude my $char = substr $str, 0, 1; my $count = 1; my $new_string = $char; my $at = 1; while ( $at < $len ) { my $old_char = $char; $char = substr $str, $at++, 1; if ( $char eq $old_char ) { next if $count >= 3; $count++; } else { $count = 1; } $new_string .= $char; } return $new_string; }
The benchmark output.
The long: Rate ikegami2A ikegami1 mrm_l ikegami3 ikegami2x ikegami2 +johngg orig kvale JavaFan ccn johngg2 mrm_s GF ikegami2A 185/s -- -74% -75% -77% -79% -80% + -87% -91% -91% -91% -92% -93% -94% -94% ikegami1 717/s 286% -- -2% -10% -18% -21% + -48% -65% -67% -67% -69% -72% -76% -78% mrm_l 730/s 294% 2% -- -9% -17% -20% + -47% -65% -66% -66% -68% -71% -75% -78% ikegami3 800/s 331% 12% 10% -- -9% -12% + -42% -61% -63% -63% -65% -69% -73% -75% ikegami2x 878/s 373% 23% 20% 10% -- -4% + -36% -58% -59% -59% -62% -65% -70% -73% ikegami2 913/s 392% 27% 25% 14% 4% -- + -34% -56% -58% -58% -60% -64% -69% -72% johngg 1380/s 644% 93% 89% 72% 57% 51% + -- -33% -36% -36% -40% -46% -53% -58% orig 2069/s 1016% 189% 183% 159% 136% 127% + 50% -- -4% -4% -10% -19% -30% -36% kvale 2160/s 1064% 201% 196% 170% 146% 137% + 57% 4% -- -0% -6% -15% -27% -34% JavaFan 2160/s 1064% 201% 196% 170% 146% 137% + 57% 4% 0% -- -6% -15% -27% -34% ccn 2303/s 1141% 221% 215% 188% 162% 152% + 67% 11% 7% 7% -- -9% -22% -29% johngg2 2544/s 1272% 255% 248% 218% 190% 179% + 84% 23% 18% 18% 10% -- -14% -22% mrm_s 2950/s 1490% 312% 304% 269% 236% 223% + 114% 43% 37% 37% 28% 16% -- -9% GF 3251/s 1653% 354% 345% 306% 270% 256% + 136% 57% 51% 51% 41% 28% 10% -- the short: Rate ikegami2x ikegami2A ikegami1 ikegami3 ikegami2 Java +Fan kvale ccn johngg mrm_l GF mrm_s orig johngg2 ikegami2x 15890/s -- -12% -62% -67% -72% - +77% -77% -79% -80% -82% -82% -82% -86% -88% ikegami2A 18088/s 14% -- -57% -63% -68% - +74% -74% -76% -78% -79% -79% -80% -84% -87% ikegami1 42081/s 165% 133% -- -13% -25% - +40% -40% -45% -48% -51% -51% -53% -62% -69% ikegami3 48558/s 206% 168% 15% -- -13% - +30% -30% -37% -40% -43% -44% -46% -56% -64% ikegami2 56109/s 253% 210% 33% 16% -- - +19% -19% -27% -31% -35% -35% -37% -50% -58% JavaFan 69591/s 338% 285% 65% 43% 24% + -- -0% -9% -14% -19% -19% -22% -38% -48% kvale 69625/s 338% 285% 65% 43% 24% + 0% -- -9% -14% -19% -19% -22% -38% -48% ccn 76588/s 382% 323% 82% 58% 36% +10% 10% -- -5% -11% -11% -14% -31% -43% johngg 81031/s 410% 348% 93% 67% 44% +16% 16% 6% -- -6% -6% -9% -27% -40% mrm_l 85929/s 441% 375% 104% 77% 53% +23% 23% 12% 6% -- -0% -4% -23% -36% GF 86067/s 442% 376% 105% 77% 53% +24% 24% 12% 6% 0% -- -3% -23% -36% mrm_s 89154/s 461% 393% 112% 84% 59% +28% 28% 16% 10% 4% 4% -- -20% -34% orig 111505/s 602% 516% 165% 130% 99% +60% 60% 46% 38% 30% 30% 25% -- -17% johngg2 134990/s 750% 646% 221% 178% 141% +94% 94% 76% 67% 57% 57% 51% 21% -- and the check: orig: 333444 ccn: 333444 kvale: 333444 johngg: 333444 johngg2: 333444 JavaFan: 333444 GF: 333444 ikegami1: 3334444 ikegami2: 333444 ikegami2A: 333444 ikegami2x: 333444 ikegami3: 3334444 mrm_s_const: 3334444 mrm_l_const: 3334444
I hope this is of interest.
Cheers,
JohnGG
In reply to Re^2: More efficient way to truncate long strings of the same character
by johngg
in thread More efficient way to truncate long strings of the same character
by kurtis
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |