--- Init ... ... completed: 2000 snippets from 200 to 400 of 6000 long DNA Check_all took 1.206 Filtered: 287 Ratio: 6.96864111498258 Check_longest took 0.436 Filtered: 287 Ratio: 6.96864111498258 Faster: 2.76641019482413 --- Init ... ... completed: 2000 snippets from 200 to 400 of 60000 long DNA Check_all took 2.990 Filtered: 924 Ratio: 2.16450216450216 Check_longest took 1.908 Filtered: 924 Ratio: 2.16450216450216 Faster: 1.56725918185693 --- Init ... ... completed: 2000 snippets from 200 to 400 of 600000 long DNA Check_all took 4.462 Filtered: 1785 Ratio: 1.12044817927171 Check_longest took 4.058 Filtered: 1785 Ratio: 1.12044817927171 Faster: 1.09963952407683 --- Init ... ... completed: 2000 snippets from 200 to 400 of 6000000 long DNA Check_all took 4.601 Filtered: 1977 Ratio: 1.01163378856854 Check_longest took 4.651 Filtered: 1977 Ratio: 1.01163378856854 Faster: 0.989378646160973 #### #! perl -slw use strict; use Time::HiRes qw[ time ]; use Data::Dump qw[ pp ]; use constant DEBUG => 0; $|=1; my (@sequences,@uniq); for my $magnitude (0..3) { my $factor = 10**$magnitude; init_sequences(6000*$factor,2000,200,400); #init_sequences(6000,20*$factor,200,400); pp \@sequences if DEBUG; my ($res_all,$time_all) = check_all(); pp $res_all if DEBUG; my ($res_longest,$time_longest) = check_longest(); pp $res_longest if DEBUG; print STDERR "Faster: ", $time_all/ $time_longest; } # --------- # subs # --------- sub uniq{ my %x; @x{@_} = (); keys %x } sub check_all { my $start = time; my @uniq = uniq(@sequences); my @result; chomp @uniq; @uniq = sort{ length $a <=> length $b } @uniq; my $all = join chr(0), @uniq; my $p = 0; for my $x ( @uniq ) { $p += 1+ length $x; next if 1+ index $all, $x, $p; ## COrrected per LanX below. push @result, $x ; } my $runtime=time() - $start; printf STDERR "Check_all \t took %.3f\t",$runtime; print STDERR "Filtered: ",scalar @result ," Ratio: ",@sequences / @result; return \@result,$runtime; } sub check_longest { my $start = time; my @uniq = uniq(@sequences); chomp @uniq; @uniq = sort{ length $b <=> length $a } @uniq; my $longest= shift @uniq; for my $x ( @uniq ) { next if 1+ index $longest, $x; $longest .= "\n" . $x; } my $runtime=time() - $start; printf STDERR "Check_longest \t took %.3f\t",$runtime; my @result=split "\n",$longest; print STDERR "Filtered: ",scalar @result ," Ratio: ",@sequences / @result; return \@result,$runtime; } sub init_sequences { my $length_dna=shift; my $num=shift; my $min=shift; my $max=shift; print STDERR "\n--- Init ..."; my $dna; @sequences=(); for (1 .. $length_dna) { $dna .= (qw/A C G T N/)[int(rand 5)] } print length $dna,": ",$dna if DEBUG; for (1 .. $num) { my $length= $min + int ( rand ($max -$min +1) ); my $offset= int ( rand ($length_dna - $length +1) ); # pp $length,$offset; push @sequences, substr $dna,$offset,$length; } print STDERR " ... completed: $num snippets from $min to $max of $length_dna long DNA"; }