in reply to Re^2: Question about speeding a regexp count
in thread Question about speeding a regexp count
A little over 10x the data takes a little over 10x as long, 36 seconds, though it needed a few tweaks to stop it from blowing gobs of memory:
#! perl -slw use strict; use Time::HiRes qw[ time ]; my $sequence = join'', map{ ( qw[ A C G T ] )[ rand 4 ] } 1 .. 100_000 +; $sequence x= 65; my $start = time; my( %one, %two, %three ); print 'starting one'; for( my $i=0; $i < length( $sequence ) - 100_000; $i += 100_000 ) { $one{ $_ }++ for unpack '(A1)*', substr $sequence, $i, 100_000; } print 'starting two'; for( my $i=0; $i < length( $sequence ) - 99_999; $i += 99_999 ) { $two{ $_ }++ for unpack '(A2X)*', substr $sequence, $i, 100_000; } delete @two{ (qw'A C G T') };; print 'starting three'; for( my $i=0; $i < length( $sequence ) - 99_999; $i += 99_999 ) { $three{ $_ }++ for unpack '(A3XX)*', substr $sequence, $i, 100_002 +; } delete @three{ (qw'AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT') } +; printf "Elapsed: %3f seconds\n", time - $start; <STDIN>; print "one\n@{[%one]}"; print "two\n@{[%two ]}"; print "three\n@{[%three]}"; __END__ P:\test>499988 starting one starting two starting three Elapsed: 36.257410 seconds one A 1604992 T 1613184 C 1589696 G 1592128 two AC 404168 CC 403645 TG 409301 AT 408133 AA 413074 CT 409560 CG 399941 +TA 409562 GC 400133 CA 401370 GT 407740 AG 404684 TC 406569 GA 406053 TT 412940 +GG 403062 three GCC 97240 AGT 100880 TGT 105820 TGA 99385 CGA 99580 ATC 103155 AAC 102 +765 AGC 100490 TAC 99060 TCG 100490 ACA 96460 CTG 103155 CCG 97500 GCA 986 +70 GTG 100230 AAG 100750 CAC 98995 GTT 105820 AGA 102310 ACC 101660 CCA 1 +03480 TGG 101985 CGC 98475 CTC 98995 TTG 105300 TAA 102700 CAG 100880 ACG 10 +1400 AAA 105365 ATG 100620 GTA 100360 CTT 102570 TAG 104650 GGA 104780 GTC +101335 TGC 102115 TCA 102765 ATT 102765 TAT 103155 AAT 104195 ACT 104650 GAC +103350 CAA 100230 GGT 98865 TCC 104130 TTT 101790 AGG 101010 CGT 102180 CGG 9 +9710 CAT 101270 ATA 101595 CCC 100620 GGG 100360 GAG 98410 TTA 102765 GAT 9 +9515 CTA 104845 TCT 99190 TTC 103090 GCG 100555 GGC 99060 GAA 104780 GCT 10 +3675 CCT 102050
|
|---|