#! perl -slw
# Benchmark: two ways of reducing a list of strings to those that are not a
# substring of any other string in the list.
#
# NOTE: the -l switch on the #! line makes every print() append a newline
# (printf is unaffected), which the report formatting below relies on.
use strict;
use warnings;
use Time::HiRes qw[ time ];
use Data::Dump qw[ pp ];
use constant DEBUG => 0;

$| = 1;

# Snippets under test. Refilled by init_sequences() on every round and read
# by check_all() and check_longest().
my @sequences;

for my $magnitude ( 0 .. 3 ) {
    my $factor = 10**$magnitude;

    # Grow the source string by a power of ten per round ...
    init_sequences( 6000 * $factor, 2000, 200, 400 );

    # ... alternative setup: grow the number of snippets instead.
    #init_sequences(6000,20*$factor,200,400);
    pp \@sequences if DEBUG;

    my ( $res_all, $time_all ) = check_all();
    pp $res_all if DEBUG;

    my ( $res_longest, $time_longest ) = check_longest();
    pp $res_longest if DEBUG;

    print STDERR "Faster: ", _ratio( $time_all, $time_longest );
}

# ---------
# subs
# ---------

# Return $numerator / $denominator, or the string 'n/a' when the denominator
# is zero, so that a report line never dies with "Illegal division by zero".
sub _ratio {
    my ( $numerator, $denominator ) = @_;
    return $denominator ? $numerator / $denominator : 'n/a';
}

# Return the distinct values of the argument list (in hash order).
sub uniq {
    my %seen;
    @seen{@_} = ();
    return keys %seen;
}

# Filter @sequences by joining all distinct strings, shortest first, into one
# buffer and looking each string up in the part of the buffer that follows it.
#
# Prints timing and filter statistics to STDERR.
# Returns ( \@result, $runtime_in_seconds ).
sub check_all {
    my $start = time;

    my @uniq = uniq(@sequences);
    my @result;
    chomp @uniq;
    @uniq = sort { length $a <=> length $b } @uniq;

    # chr(0) separates the entries inside the search buffer.
    my $all = join chr(0), @uniq;

    my $p = 0;
    for my $x (@uniq) {

        # Advance to the start of the next entry: only entries that come
        # later (same length or longer) are searched, and $x can never
        # match itself.
        $p += 1 + length $x;
        next if index( $all, $x, $p ) >= 0;
        push @result, $x;
    }

    my $runtime = time() - $start;
    printf STDERR "Check_all \t took %.3f\t", $runtime;
    print STDERR "Filtered: ", scalar @result, " Ratio: ",
        _ratio( scalar @sequences, scalar @result );

    return ( \@result, $runtime );
}

# Filter @sequences by walking the distinct strings longest first and
# appending each one to a newline-separated buffer unless the buffer already
# contains it.
#
# Prints timing and filter statistics to STDERR.
# Returns ( \@result, $runtime_in_seconds ).
sub check_longest {
    my $start = time;

    my @uniq = uniq(@sequences);
    chomp @uniq;
    @uniq = sort { length $b <=> length $a } @uniq;

    # The longest string cannot be contained in any other, so it seeds the
    # buffer. Fall back to '' when there is no input at all.
    my $longest = shift @uniq;
    $longest = '' unless defined $longest;

    for my $x (@uniq) {
        next if index( $longest, $x ) >= 0;
        $longest .= "\n" . $x;
    }

    my $runtime = time() - $start;
    printf STDERR "Check_longest \t took %.3f\t", $runtime;

    my @result = split "\n", $longest;
    print STDERR "Filtered: ", scalar @result, " Ratio: ",
        _ratio( scalar @sequences, scalar @result );

    return ( \@result, $runtime );
}

# Refill @sequences with random substrings of a freshly generated random
# string over the letters A, C, G, T and N.
#
#   $length_dna - length of the generated source string
#   $num        - number of snippets to cut
#   $min, $max  - inclusive bounds of the snippet length
#
# NOTE(review): the offset computation assumes $max <= $length_dna; a larger
# $max makes the rand() argument negative - confirm callers respect this.
sub init_sequences {
    my ( $length_dna, $num, $min, $max ) = @_;

    print STDERR "\n--- Init ...";

    my @alphabet = qw/A C G T N/;
    my $dna      = '';
    @sequences = ();

    for ( 1 .. $length_dna ) {
        $dna .= $alphabet[ int( rand 5 ) ];
    }
    print length($dna), ": ", $dna if DEBUG;

    for ( 1 .. $num ) {
        my $length = $min + int( rand( $max - $min + 1 ) );
        my $offset = int( rand( $length_dna - $length + 1 ) );

        # pp $length,$offset;
        push @sequences, substr $dna, $offset, $length;
    }

    print STDERR
        " ... \ncompleted: $num snippets from $min to $max of $length_dna long DNA";
}