Script #1 #!/usr/bin/perl -w use strict; my $ntca_pattern = "GTA.{8}TAC.{20,24}TA.{3}T"; # Pattern to search for my $sequence = get_genome_sequence("NostocChromosome.nt"); # Sequence to search within # Find and print out all the exact matches # my $exact_pattern = fuzzy_pattern($ntca_pattern, 0); my @exact_matches = match_positions($exact_pattern, $sequence); if (@exact_matches) { print "Exact matches:\n"; print_matches(@exact_matches); } # Now do the same, but allow one mismatch # my $one_mismatch = fuzzy_pattern($ntca_pattern, 1); my @approximate_matches = match_positions($one_mismatch, $sequence); if (@approximate_matches) { print "Matches with one possible mismatch\n"; print_matches(@approximate_matches); } sub print_matches { my (@matches) = @_; foreach my $match (@matches) { print $match, "\n"; } } sub get_genome_sequence { my ($file_path) = @_; open GENOME, "<$file_path" or die "Can't open $file_path: $!\n"; $_ = ; # discard first line my $sequence = ""; while () { chomp($_); # Remove line break s/\r//; # And carriage return $_ = uc $_; # Make the line uppercase; $sequence = $sequence . $_; # Append the line to the sequence } return $sequence; } ##### Black magic past this point # use re qw(eval); use vars qw($matchStart); # match_positions($pattern, $sequence) returns the start and end points of all # places in $sequence matching $pattern. # sub match_positions { my $pattern; local $_; ($pattern, $_) = @_; my @results; local $matchStart; my $instrumentedPattern = qr/(?{ $matchStart = pos() })$pattern/; while (/$instrumentedPattern/g) { my $nextStart = pos(); push @results, "[$matchStart..$nextStart)"; pos() = $matchStart+1; } return @results; } # fuzzy_pattern($pattern, $mismatches) returns a pattern that matches whatever # $pattern matches, except that bases may differ in at most $mismatches # positions. # sub fuzzy_pattern { my ($original_pattern, $mismatches_allowed) = @_; $mismatches_allowed >= 0 or die "Number of mismatches must be greater than or equal to zero\n"; my $new_pattern = make_approximate($original_pattern, $mismatches_allowed); return qr/$new_pattern/; } # make_approximate("ACT", 1) returns "(?:A(?:C.|.T)|.CT)" # make_approximate("ACT", 2) returns "(?:A..|.(?:C.|.T))" # make_approximate("ACGT", 2) returns # "(?:A(?:C..|.(?:G.|.T))|.(?:C(?:G.|.T)|.GT))" # sub make_approximate { my ($pattern, $mismatches_allowed) = @_; if ($mismatches_allowed == 0) { return $pattern } elsif (length($pattern) <= $mismatches_allowed) { $pattern =~ tr/ACTG/./; return $pattern } else { my ($first, $rest) = $pattern =~ /^(.)(.*)/; my $after_match = make_approximate($rest, $mismatches_allowed); if ($first =~ /[ACGT]/) { my $after_miss = make_approximate($rest, $mismatches_allowed-1); return "(?:$first$after_match|.$after_miss)"; } else { return "$first$after_match" } } }