#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;

# Count how often each of a set of short sequences occurs in a big file,
# demonstrated two ways:
#
#   METHOD 1 - one precompiled regex per sequence; every sequence is
#              searched for independently on every line.
#   METHOD 2 - a single alternation regex (longest sequence first); one
#              pass per line, but a match consumes its text, so sequences
#              overlapping an earlier match on the same line are missed.
#
# The "big file" is simulated with the __DATA__ section at the bottom.

my %seqs;

# Remember where the DATA section starts so METHOD 2 can rewind to it.
# DATA is the very handle perl used to read this script, so seeking to
# offset 0 would land on the first line of the source, not on the data.
my $data_start = tell DATA;

# Slurp the file containing the sequences you want to find into a scalar,
# like this:
#
#   open my $fh, '<', $finds or die "Can't open $finds, Perl says $!\n";
#   my $file = do { local $/; <$fh> };
#   close $fh;
#
# Simulate the file slurp result thusly (one sequence per line):
my $file = join "\n", qw(AAA GGG AAAGGG TTTATAATA AGA ATA TTT);

print "METHOD 1\n\n";

# Use a hash of hashes to store the compiled regexes and also the counts.
# \Q escapes the sequence so it is always matched literally.
for my $seq (split "\n", $file) {
    $seqs{$seq}->{'re'} = qr/\Q$seq/;
}

# Process the big file line by line (DATA filehandle in this simulation).
while (my $line = <DATA>) {
    for my $seq (keys %seqs) {
        my $seq_re = $seqs{$seq}->{'re'};

        # A /g match in list context returns one element per match, so the
        # postfix "for" bumps the counter once per occurrence on this line.
        # Sequences that never match simply get no 'count' key.
        $seqs{$seq}->{'count'}++ for $line =~ m/$seq_re/g;
    }
}
print Dumper \%seqs;

print "\n\n\nMETHOD 2\n\n";

# Re-read the data from the start of the DATA section and clear %seqs.
seek(DATA, $data_start, 0) or die "Can't rewind DATA: $!\n";
%seqs = ();

# Generate a regex that searches for all the sequences at once, sorted by
# descending length so the longest possible match wins at each position.
# Note this method will miss overlaps (see the Data::Dumper output).
my $alternation = join '|',
    map  { quotemeta $_ }
    sort { length($b) <=> length($a) }
    split "\n", $file;

# Compile the regex only once using qr.
my $any_seq_re = qr/($alternation)/;

# Process the big file line by line (DATA filehandle in this simulation).
while (my $line = <DATA>) {
    # The single capture group makes the /g match return every matched
    # sequence on the line; count each one under its own text.
    $seqs{$_}++ for $line =~ m/$any_seq_re/g;
}
print Dumper \%seqs;

__DATA__
AAAGGGAAA
TTTATAATA
GGGTTTATA
CCCTTTCCC
UUUUUUUUU
TTTGGGATA