#! perl -slw use strict; our $L ||= 200; our $SEED ||= 1; our $RANGE ||= "'a'..'c'"; #srand $SEED; my @chars = eval $RANGE; my $s = join'', map{ ( @chars )[ rand @chars ] } 1 .. $L; my @rls; use constant { BASESEQ => 0, SAVED => 1, OFFSET => 2, LENGTH => 3, REPEATS => 4 }; ## Find runs an record them push @rls, [ $1, length $2, pos($s) - length( $1 ), length( $1 ) + length( $2 ), length( $2 ) / length( $1 ), ] while $s =~ m[(.+)(?=(\1{1,254}))]g; ## Eliminate overlaps deleting the ## lesser saving in each case. for my $p ( 0 .. $#rls ) { next unless defined $rls[ $p ]; for my $q ( $p+1 .. $#rls ) { next unless defined $rls[ $p ] and defined $rls[ $q ]; last if $rls[ $q ][ OFFSET ] > $rls[ $p ][ OFFSET ] + $rls[ $p ][ LENGTH ]; $rls[ $q ][ OFFSET ] >= $rls[ $p ][ OFFSET ] && $rls[ $q ][ OFFSET ] < $rls[ $p ][ OFFSET ] + $rls[ $p ][ LENGTH ] and delete $rls[ $rls[ $p ][ SAVED ] < $rls[ $q ][ SAVED ] ? $p : $q ]; } } ## Remove the overlaps @rls = grep defined, @rls; ## Display the results print $s; print ' ' x $_->[ OFFSET ], uc( $_->[ BASESEQ ] ), $_->[ BASESEQ ] x $_->[ REPEATS ] for @rls; __END__ #### [ 2:44:41.87] P:\test>390333 -L=70 -RANGE='a'..'b' abaabbaaaabbaaaabaabaabaaaababaababbbbbaababbaabbaaaaabbbbaabbaababbba AABBAAaabbaa AABaabaab AABABaabab Bbb ABBAabba Aaa BBAAbbaa Bbb [ 2:44:42.69] P:\test>390333 -L=70 -RANGE='a'..'b' abbbaaabbaabbbbbbbbbaaabbaaaaabbbabaaaabaaababaababbabbaaaaaabbbbbabbb Bbb AABBaabb BBAAAbbaaa Aa Bbb AABABaabab Bb AAAaaa BBbb Bbb [ 2:44:43.55] P:\test>390333 -L=70 -RANGE='a'..'b' aabbbaaaabaabbbbbbbabaaabaaaaaabaabbbabaabbaabbaababbaaabaabaaabbababb Aa Bbb AABaab BBbb BAAAbaaa AABaab BAABbaabbaab Bb AABaab Aaa BAba Bb [ 2:44:44.37] P:\test>390333 -L=70 -RANGE='a'..'b' aabbbbaaabbbabababbbabbbbbababababbaabaabaaaabaababbaabbabbabbbbaabbbb Aa BBbb Aaa BAbaba BBbb BABAbaba BAAbaabaa AABaab ABBabbabb Bb Aa BBbb #### #! perl -slw use strict; use constant { BASESEQ => 0, SAVED => 1, OFFSET => 2, LENGTH => 3, REPEATS => 4 }; our $L ||= 200; our $RANGE ||= "'a'..'c'"; #srand $SEED if $SEED; sub findRuns { my $s = shift; my @rls; push @rls, [ $1, length $2, pos($s) - length( $1 ), length( $1 ) + length( $2 ), length( $2 ) / length( $1 ) + 1, ] while $s =~ m[ .*? ( [a-z]+ ) (?= ( \1{1,254} ) [^)]* (?: \( | $ ) ) ]gx; for my $p ( 0 .. $#rls ) { next unless defined $rls[ $p ]; for my $q ( $p+1 .. $#rls ) { next unless defined $rls[ $p ] and defined $rls[ $q ]; last if $rls[ $q ][ OFFSET ] > $rls[ $p ][ OFFSET ] + $rls[ $p ][ LENGTH ]; while( $rls[ $q ][ OFFSET ] >= $rls[ $p ][ OFFSET ] && $rls[ $q ][ OFFSET ] < $rls[ $p ][ OFFSET ] + $rls[ $p ][ LENGTH ] ) { if( $rls[ $p ][ SAVED ] < $rls[ $q ][ SAVED ] ) { last unless $rls[ $p ][ REPEATS ]-- > 2; $rls[ $p ][ REPEATS ] -= length $rls[ $p ][ BASESEQ ]; } else { last unless $rls[ $q ][ REPEATS ]-- > 2; $rls[ $q ][ REPEATS ] -= length $rls[ $q ][ BASESEQ ]; $rls[ $q ][ OFFSET ] += length $rls[ $q ][ BASESEQ ]; } } $rls[ $q ][ OFFSET ] >= $rls[ $p ][ OFFSET ] && $rls[ $q ][ OFFSET ] < $rls[ $p ][ OFFSET ] + $rls[ $p ][ LENGTH ] and delete $rls[ $rls[ $p ][ SAVED ] < $rls[ $q ][ SAVED ] ? $p : $q ]; } } @rls = grep defined, @rls; for ( @rls ) { my $re = qr[(?:$_->[ BASESEQ ]){$_->[ &REPEATS ]}]; $s =~ s[ ( (?: ^ | \) ) [^(]*? ) $re] { ($1||'') . '(' . $_->[ REPEATS ] . ":$_->[ BASESEQ ])" }ex; } $s = findRuns( $s ) if $s =~ m[ (?: ^ | \) ) [^(]*? ( [a-z]+ ) \1 [^)]*? (?: \( | $ ) ]x; return $s; } my @chars = eval $RANGE; my $s = join'', map{ ( @chars )[ rand @chars ] } 1 .. $L; print scalar localtime, ' : ', $s, $/; my $encoded = findRuns $s; print scalar localtime, ' : ', $encoded, $/; $encoded =~ s[\((\d+):([^)]+)\)][$2 x $1]eg; print scalar localtime, ' : ', $encoded, $/; #### [ 6:10:08.93] P:\test>390333 -L=70 -RANGE='a'..'b' Sun Sep 12 06:11:42 2004 : aabbbabababaaabaaaabababbabaaabaaabaabbabbbabababaabbaaabbaabbbabaaaab Sun Sep 12 06:11:42 2004 : (2:a)(2:b)(2:baba)(2:a)b(3:a)(3:ab)ba(2:baaa)ba(2:abb)(3:ba)b(2:aabba)a(3:b)ab(2:aa)b Sun Sep 12 06:11:42 2004 : aabbbabababaaabaaaabababbabaaabaaabaabbabbbabababaabbaaabbaabbbabaaaab [ 6:11:42.57] P:\test>390333 -L=70 -RANGE='a'..'b' Sun Sep 12 06:11:46 2004 : bbaaaabbabbbbababaabbbaaababbababbbbbbbabbbbabaaaabaababaaaabbaabbbaaa Sun Sep 12 06:11:46 2004 : (2:b)(3:a)(2:abb)b(3:ba)a(3:b)(2:a)(2:ababb)b(2:bbbba)b(2:a)(2:aab)ab(2:a)(2:aabb)b(3:a) Sun Sep 12 06:11:46 2004 : bbaaaabbabbbbababaabbbaaababbababbbbbbbabbbbabaaaabaababaaaabbaabbbaaa