#! perl -slw use strict; use Time::HiRes qw[ time ]; our $MIN //= 5; our $SHORT //= 20; our $NSHORTS //= 10e3; our $BIG //= 5e6; our $S //= 0; $S and srand( $S ); sub rndDna{ my $dna =''; $dna .= ( qw[A C G T] )[ rand 4 ] for 1 .. $_[0]; return $dna; } my $bigDna = rndDna( $BIG ); my @shorts = map rndDna( $SHORT ), 1 .. $NSHORTS; my $start = time; my %idx; push @{ $idx{ substr $bigDna, $_, $MIN } }, $_ for 0 .. length( $bigDna ) - $MIN; printf STDERR "Indexing took %.9f secs\n", time() -$start ; $start = time; my $found = 0; for my $short ( @shorts ) { for my $pos ( @{ $idx{ my $pre = substr $short, 0, $MIN } } ) { if( substr( $bigDna, $pos, $SHORT ) eq $short ) { print "$short found at $pos"; ++$found; } } } print STDERR "Found: ", $found; printf STDERR "Lookup of $NSHORTS $SHORT-nt short reads against a $BIG-nt sequence took %.9f secs\n", time() -$start ; __END__ C:\test>1120869-2 -BIG=5e6 -MIN=5 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 3.365217924 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 25.568705082 secs 182 C:\test>1120869-2 -BIG=5e6 -MIN=6 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 3.351562023 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 6.666512966 secs 182 C:\test>1120869-2 -BIG=5e6 -MIN=7 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 4.609364986 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 1.699807882 secs 182 C:\test>1120869-2 -BIG=5e6 -MIN=7 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 4.519516945 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 1.682117939 secs 182 C:\test>1120869-2 -BIG=5e6 -MIN=8 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 6.175763845 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 0.424978018 secs 182 C:\test>1120869-2 -BIG=5e6 -MIN=10 -SHORT=14 -NSHORTS=10000 -S=1 | wc -l Indexing took 9.214743853 secs Found: 182 Lookup of 10000 14-nt short reads against a 5e6-nt sequence took 0.058255196 secs 182 #### 1 x 5e6nt sequence; 10e3 x 14nt short reads; Index prefix: indexing(secs): lookup(secs): found: memory(MB): 5 3.37 25.57 182 200 6 3.38 6.66 182 210 7 4.06 1.69 182 230 8 6.17 0.42 182 244 10 9.21 0.05 182 450