#!/usr/bin/perl -w
use strict;
use Data::Dumper;
# Test data: sequence name => array ref of single-character elements.
my $hash = {
    'S1' => [ 'A', 'B', 'C', 'D', 'H', 'A' ],
    'S2' => [ 'A', 'C', 'D', 'B', 'G', 'J' ],
    'S3' => [ 'C', 'A', 'D', 'H', 'M', 'K' ],
    'S4' => [ 'A', 'B', 'I', 'C', 'I', 'D' ],
};

# Query strings: there is always a space between the query chars.
my $seq1 = 'A C D';

# These two are alternative queries kept for testing.
my $seq2 = 'A B C';
my $seq3 = 'A B';

# Align against the first query; pass $seq2 or $seq3 to try the others.
# The argument must be one of the declared query variables, since an
# undeclared name is a compile-time error under "use strict".
my @alignment = align($hash, $seq1);
print Dumper \@alignment;
# align($hashref, $seq) -> list of gap-padded strings
#
# $hashref : hash ref mapping a sequence name to an array ref of characters.
# $seq     : query characters separated by whitespace, e.g. 'A C D'.
#
# Each array is joined into a string. A sequence takes part in the
# alignment when its string contains the query characters in order,
# possibly with other characters ("gaps") between neighbouring query
# characters. For every gap position the widest gap among all matching
# sequences is determined, and each matching sequence is returned from its
# first query character to its last, with every narrower gap right-padded
# with '-' up to that width.
#
# Results are returned in sorted key order. An empty list is returned when
# the query is empty/undefined or no sequence matches.
sub align {
    my ($hashref, $seq) = @_;

    # split ' ' skips leading whitespace and collapses runs of whitespace,
    # so stray spaces never produce empty query tokens.
    my @query_chars = defined $seq ? split(' ', $seq) : ();
    return if !@query_chars || !$hashref;

    # One lazy capture group between each pair of neighbouring query
    # characters; quotemeta keeps characters such as '*' or '.' literal.
    my $pattern = join '(.*?)', map { quotemeta($_) } @query_chars;
    my $matcher = qr/$pattern/s;

    # A query of N characters has N-1 gaps ($#query_chars == N-1).
    my @widest = (0) x $#query_chars;    # widest gap seen per position
    my @matches;                         # gap strings of each matching sequence

    for my $key (sort keys %$hashref) {
        my $string = join '', @{ $hashref->{$key} };
        next unless $string =~ $matcher;

        # Read capture N through @- / @+ (start/end offsets of group N);
        # a string such as '$1' is only text and never yields the capture.
        my @gaps = map { substr($string, $-[$_], $+[$_] - $-[$_]) }
                   1 .. $#query_chars;

        for my $i (0 .. $#gaps) {
            my $width = length($gaps[$i]);
            $widest[$i] = $width if $width > $widest[$i];
        }
        push @matches, \@gaps;
    }

    # Rebuild every match as: query char, gap, '-' padding, query char, ...
    my @hyph_padded_seq;
    for my $gaps (@matches) {
        my $aligned = $query_chars[0];
        for my $i (0 .. $#{$gaps}) {
            my $padding = '-' x ($widest[$i] - length($gaps->[$i]));
            $aligned .= $gaps->[$i] . $padding . $query_chars[$i + 1];
        }
        push @hyph_padded_seq, $aligned;
    }

    return @hyph_padded_seq;
}
####
Query: "A C D" Query: "A B C" Query: "A B" Query: "C D"
Answers: Answers: Answers: Answers:
[ [ [ [
'AB-C-D', 'ABIC', 'A--B', 'C-D',
'A--C-D', 'AB-C' 'ACDB', 'C-D',
'ABICID', ] 'A--B' 'CAD',
] ] 'CID'
]
####
Take Query "A C D" as an example:
'S1' => [ 'A', 'B', 'C', 'D','H','A' ],
'S2' => [ 'A', 'C', 'D', 'B','G','J' ],
'S3' => [ 'C', 'A', 'D', 'H','M','K' ],
'S4' => [ 'A', 'B', 'I', 'C','I','D' ]
1. The query "A C D" (its characters in order, possibly with others in between) is found only in S1, S2 and S4.
Thus only the arrays from S1, S2 and S4 are taken for alignment.
2. Note that S4->"ABICID" has the largest total gap compared to S1 and S2.
As shown here:
S4->"ABICID" gives A[BI]C[I]D, since there are 2 + 1 gaps.
2 for 'BI', 1 for 'I'
S1->"ABCDHA" gives A[B]CD, only 1 gap, for [B]
S2->"ACDBGJ" gives ACD, without any gaps in between.
3. Then S1 and S2 must be aligned *based* on S4. That means
S1 and S2 are each used only up to the point where the matched D ends.
S1->"ABCD"
S2->"ACD"
These are the two strings we align with S4->"ABICID"
4. Now, lets align S1->"ABCD" with S4->"ABICID"
In S1, there is 1 gap in ABC in comparison to S4
like this:
S1-> AB-C
S4-> ABIC
There is another 1 gap in CD in comparison to S4
like this:
S1-> C-D
S4-> CID
Thus the full alignment between S1 and S4 is:
S1-> AB-C-D
S4-> ABICID
5. Then align S2->"ACD" with S4->"ABICID"
In S2, there are 2 gaps in AC in comparison to S4
like this:
S2-> A--C
S4-> ABIC
There is another 1 gap in CD in comparison to S4
like this:
S2-> C-D
S4-> CID
Thus the full alignment between S2 and S4 is:
S2-> A--C-D
S4-> ABICID
6. The final alignment gives:
S1-> AB-C-D
S2-> A--C-D
S4-> ABICID