#!/usr/bin/perl use warnings; use strict; use re 'eval'; use Time::HiRes qw( gettimeofday tv_interval ); my $t0 = [ gettimeofday ]; $\ = $/; my $debug = 0; my $debug2 = 0; my $time = 0; my @acgt = qw( a c g t ); my @combinations = map { [ combinations( $_, $_ - 1, @acgt ) ] } 1 .. 5; $debug and print ~~ @{ $_ } for @combinations; my %regexes = ( '_' => 1 ); while( 1 ){ my @old_regexes = sort { length $a <=> length $b || $a cmp $b } keys %regexes; $debug and print " " . @old_regexes; for my $i ( 0 .. @old_regexes - 1 ){ my $copy_of_index_i = $i; $i = $old_regexes[ $i ]; if( $i !~ /\*/ ){ $regexes{ "($i*)" } ++ if ( length $i ) + 3 < 16; for my $j ( $copy_of_index_i + 0 .. @old_regexes - 1 ){ $j = $old_regexes[ $j ]; last if ( length $i ) + ( length $j ) + 3 > 15; next if $j =~ /\*/; $regexes{ "($i|$j)" } ++; # $regexes{ "($j|$i)" } ++; } } for my $j ( $copy_of_index_i + 0 .. @old_regexes - 1 ){ $j = $old_regexes[ $j ]; last if ( length $i ) + ( length $j ) + 2 > 15; next if $i =~ /\*/ && $j =~ /\*/; $regexes{ "($i$j)" } ++; $regexes{ "($j$i)" } ++; } } last if @old_regexes == keys %regexes; } $debug2 and print "regexes(30): ", join ' ', ( sort { length $a <=> length $b } keys %regexes )[ 0 .. 30 ]; $debug and print "number of abstract regexes: " . keys %regexes; my %uniq; map $uniq{ $_ } ++, keys %regexes; %regexes = map { $_ => 1 } keys %uniq; $debug and print ~~ keys %regexes; for my $re ( sort keys %regexes ){ $re =~ / (\w) [(] (\w)(\w) [)] /x or next; delete $regexes{ $re }; $re =~ s/ (\w) [(] (\w)(\w) [)] /($1$2)$3/x; $regexes{ $re } ++; } $debug and print ~~ keys %regexes; for my $re ( sort keys %regexes ){ $re =~ / (\w) [(] [(] (\w)(\w) [)] (\w) [)] /x or next; delete $regexes{ $re }; $re =~ s/ (\w) [(] [(] (\w)(\w) [)] (\w) [)] /(($1$2)$3)$4/x; $regexes{ $re } ++; } $debug and print ~~ keys %regexes; for my $re ( sort keys %regexes ){ $re =~ / [(] (\w) (\w) [)] [(] (\w) (\w) [)] /x or next; delete $regexes{ $re }; $re =~ s/ [(] (\w) (\w) [)] [(] (\w) (\w) [)] /(($1$2)$3)$4/x; $regexes{ $re } ++; } $debug and print ~~ keys %regexes; $regexes{ "((((a|c)|g)|t)*)" } ++; $debug and print ~~ keys %regexes; my @regexes = sort { length $a <=> length $b } keys %regexes; y/_/./ for @regexes; $debug2 and print "regexes(30): @regexes[ 0 .. 30 ]"; %regexes = map { $_ => 1 } @regexes; my %HoR; for my $re ( keys %regexes ){ my $cnt = () = $re =~ /\./g; for my $comb ( @{ $combinations[ $cnt - 1 ] } ){ $_ = $re; for my $letter ( split //, $comb ){ s/\./$letter/; } next if /\*/ and 4 == $cnt and /(\w).*\1/; next if / [(] (\w) \| (\w) [)] /x and $1 ge $2; next if / [(] [(] (\w) \| (\w) [)] \| (\w) [)] /x and $1 ge $2 || $1 ge $3 || $2 ge $3; push @{ $HoR{ $re } }, qr/^$_$/; } } if( $debug ){ my @c = ( 0 ) x 5; my $c = 0; for my $re ( keys %HoR ){ my $cnt = () = $re =~ /\./g; $c[ $cnt - 1 ] += @{ $HoR{ $re } }; $c += @{ $HoR{ $re } }; } print "all regexes: " . $c; print " $_" for @c; } $time and print ' ', tv_interval( $t0 ); ################ MAIN: <>; while( <> ){ @_ = map ~~<>, 1 .. $_; chomp @_; s/\S+ // for @_; $debug2 and print "\@_:@_"; s/(.)\1{2,}/ $1 x 3 /ge for @_; s/(.{2,4})\1{2,}/ $1 x 2 /ge for @_; $time and print ' ', tv_interval( $t0 ); my %uniq; map $uniq{ $_ } ++, @_; @_ = sort keys %uniq; my @regexes = sort { length $a <=> length $b } keys %HoR; for my $str ( @_ ){ $debug and print " candidate abstract regexes: " . @regexes; $debug2 and print " str:[$str]"; my @ok; for my $re ( @regexes ){ $str =~ /^$re$/ and push @ok, $re; } @regexes = @ok; } $debug and print " abstract regexes which match: " . @regexes; $debug2 and print " abstract regexes: @regexes"; my @real; @regexes = map @{ $HoR{ $_ } }, @regexes; for my $str ( @_ ){ $debug and print " candidate regexes: " . @regexes; $debug2 and print " str:[$str]"; my @ok; for my $re ( @regexes ){ $str =~ $re and push @ok, $re; # HOT SPOT } @regexes = @ok; } $debug and print " regexes which match: " . @regexes; my $shortest = shift @regexes; my $strip = join "(.*)", map quotemeta, qw[ (?^:^ $) ]; map { s/^ $strip $/$1/x } $shortest; #^ map { s/^ \Q(?^:^\E //x, s/ \Q$)\E $//x } $shortest; # alternative: OK print $shortest ? $shortest : "Impossible"; $time and print tv_interval( $t0 ); $debug and print '-' x 20; } ########## END MAIN sub combinations { my( $length, $min_set, @letters ) = @_; my $str = join '-', ( join '', @letters ) x $length; $debug2 and print $str; my $re = join join( '-', ( '.*?' ) x 2 ), ('(.)') x $length; $debug2 and print $re; my %combs; $str =~ /$re (?{ $combs{ join '', grep defined, $1, $2, $3, $4, $5 } ++ }) (*FAIL)/x; my @combs = keys %combs; $debug2 and print "all combs: " . @combs; # @combs = grep { !/(.)\1/ } @combs; # $debug2 and print "all combs: " . @combs; @combs = grep { my %uniq; map $uniq{ $_ } ++, split //; $min_set <= keys %uniq; } @combs; $debug2 and print "unique combs: " . @combs; $debug2 and print "@combs"; return @combs; }