#!/usr/bin/perl -w
use strict;
use warnings;
use Data::Dumper;

# Sample data: each key names a sequence and each value holds that sequence
# as a list of symbols.
my $hash = {
    'S1' => [ 'A', 'B', 'C', 'D', 'H', 'A' ],
    'S2' => [ 'A', 'C', 'D', 'B', 'G', 'J' ],
    'S3' => [ 'C', 'A', 'D', 'H', 'M', 'K' ],
    'S4' => [ 'A', 'B', 'I', 'C', 'I', 'D' ],
};

my $seq1 = 'A C D';    # there is always a space between the query chars

# These two are for testing
my $seq2 = 'A B C';
my $seq3 = 'A B';

my @alignment = align( $hash, $seq1 );
print Dumper \@alignment;

# align($hashref, $seq)
#
# Aligns every sequence in %$hashref that contains the query symbols of $seq
# in order (other symbols may sit in between).
#
#   $hashref - hash reference mapping a name to an array reference of
#              symbols; each array is joined into one string before matching.
#   $seq     - query symbols separated by whitespace, e.g. 'A C D'.
#
# Returns a list with one aligned string per matching sequence, in sorted key
# order. Each string runs from the first query symbol to the last one. Between
# two consecutive query symbols it keeps the symbols found there and pads with
# '-' up to the widest gap that any matching sequence has in that position.
# Sequences that do not contain the query are left out. An empty or undefined
# query yields an empty list.
sub align {
    my ( $hashref, $seq ) = @_;

    # split ' ' ignores leading whitespace and collapses runs of it.
    my @query = split ' ', ( defined $seq ? $seq : '' );
    return if !@query;

    # One lazy capture group per gap between consecutive query symbols.
    # quotemeta keeps symbols such as '.' or '*' literal.
    my $pattern = join '(.*?)', map { quotemeta } @query;
    my $regex   = qr/$pattern/s;

    my $gap_count = $#query;               # a query of n symbols has n-1 gaps
    my @widest    = (0) x $gap_count;      # widest gap seen in each position
    my @matches;                           # captured gaps, one entry per match

    for my $key ( sort keys %{$hashref} ) {
        my $string = join '', @{ $hashref->{$key} };
        next unless $string =~ $regex;

        # Read the captures through @- / @+ instead of building "$1", "$2"
        # as strings, which only yields the text '$1' and not its value.
        my @gaps =
          map { substr( $string, $-[$_], $+[$_] - $-[$_] ) } 1 .. $gap_count;

        for my $slot ( 0 .. $#gaps ) {
            my $width = length( $gaps[$slot] );
            $widest[$slot] = $width if $width > $widest[$slot];
        }
        push @matches, \@gaps;
    }

    my @hyph_padded_seq;
    for my $gaps (@matches) {
        my $aligned = $query[0];
        for my $slot ( 0 .. $#{$gaps} ) {
            my $gap     = $gaps->[$slot];
            my $padding = '-' x ( $widest[$slot] - length($gap) );
            $aligned .= $gap . $padding . $query[ $slot + 1 ];
        }
        push @hyph_padded_seq, $aligned;
    }

    return @hyph_padded_seq;
}

__END__
####
Query: "A C D"   Query: "A B C"   Query: "A B"   Query: "C D"
Answers:         Answers:         Answers:       Answers:
[                [                [              [
  'AB-C-D',        'ABIC',          'A--B',        'C-D',
  'A--C-D',        'AB_C'           'ACDB',        'C-D',
  'ABICID',      ]                  'A--B'         'CAD',
]                                 ]                'CID'
                                                 ]
####
Take Query "A C D" as an example:

'S1' => [ 'A', 'B', 'C', 'D','H','A' ],
'S2' => [ 'A', 'C', 'D', 'B','G','J' ],
'S3' => [ 'C', 'A', 'D', 'H','M','K' ],
'S4' => [ 'A', 'B', 'I', 'C','I','D' ]

1. String "A C D" can be found only in S1,S2,S4.
Thus, only the arrays from S1, S2 and S4 are taken for alignment.

2. S4->"ABICID" gives the biggest gap compared to S1 and S2, as shown here:
S4->"ABICID" gives A[BI]C[I]D, since there are 2 + 1 gaps: 2 for 'BI', 1 for 'I'.
S1->"ABCDHA" gives A[B]CD, only 1 gap, for [B].
S2->"ACDBGJ" gives ACD, without any gaps in between.

3. Then S1 and S2 must be aligned *based* on S4.
That means the maximum span of S1 and S2 is until D ends:
S1->"ABCD"
S2->"ACD"
These are the two strings we align with S4->"ABICID".

4. Now, let's align S1->"ABCD" with S4->"ABICID".
In S1, there is 1 gap in ABC in comparison to S4, like this:
S1-> AB-C
S4-> ABIC
There is another 1 gap in CD in comparison to S4, like this:
S1-> C-D
S4-> CID
Thus the full alignment between S1 and S4 is:
S1-> AB-C-D
S4-> ABICID

5. Then align S2->"ACD" with S4->"ABICID".
In S2, there are 2 gaps in AC in comparison to S4, like this:
S2-> A--C
S4-> ABIC
There is another 1 gap in CD in comparison to S4, like this:
S2-> C-D
S4-> CID
Thus the full alignment between S2 and S4 is:
S2-> A--C-D
S4-> ABICID

6. The final alignment gives:
S1-> AB-C-D
S2-> A--C-D
S4-> ABICID