# Always start your program with these two lines.
# They do a lot of your error checking for you and are
# more accurate than your eyeballs.
use strict;
use warnings;

#-----------------------------------------
# Purpose: read DNA sequences (one per line) from the DATA
# stream and print, for every base, the fraction of sequences
# that have that base at each position.
#
# Counts are stored in a hash of array references.
#  - There is one hash key for each DNA base.
#  - The value assigned to each hash key is an array
#    reference. Its Nth element stores the number of times
#    that base appears at the Nth position.
#    (N=0 is the first position.)
#
# @aTotalAtPos stores, for each position, how many bases were
# seen there in total; it is the denominator used when the
# frequencies are printed.
#-----------------------------------------
my %hFrequency;
my @aTotalAtPos;
my $iMaxSequenceLength = 0;

# Read in one line of data at a time from the stream DATA.
# See below at __DATA__ for the actual data.
while (my $sSequence = <DATA>) {

    # remove end of record marker, i.e. newline, from sequence
    chomp $sSequence;

    # an empty regex // can be used to split a string into characters
    my @aBases = split(//, $sSequence);

    # keep track of maximum sequence length: we'll need it
    # later to print out the matrix
    my $iSequenceLength = scalar(@aBases);
    if ($iSequenceLength > $iMaxSequenceLength) {
        $iMaxSequenceLength = $iSequenceLength;
    }

    # up the count for each base/position found
    for my $iPos (0 .. $#aBases) {
        my $sBase = $aBases[$iPos];    # current char

        # the array reference for a new base is autovivified here
        $hFrequency{$sBase}[$iPos]++;
        $aTotalAtPos[$iPos]++;
    }
}

#-----------------------------------------
# print the matrix
# done with printf, but you might prefer
# to use standard Perl formatting.
#-----------------------------------------

# use a constant so we make sure we have the
# same format for the base column each time
# we print out a row
my $BASE_FORMAT = "%4s |";

# use a constant so we make sure we have the
# same width each time we print out a row
my $POS_WIDTH = 5;

# print out header: one right-justified column per position
printf($BASE_FORMAT, 'Base');    # label column
for my $iPos (0 .. $iMaxSequenceLength - 1) {
    printf("%${POS_WIDTH}d |", $iPos);
}
print "\n";                      # end row

# print out divider bar below header
my $sDividerCell = ('-' x $POS_WIDTH) . ' |';
print '---- |';                  # label column
print $sDividerCell x $iMaxSequenceLength;
print "\n";                      # end row

# print out one row for each base
foreach my $sBase (sort keys %hFrequency) {

    # $aCounts is an array reference;
    # $aCounts->[$iPos] looks up a single element of it
    my $aCounts = $hFrequency{$sBase};

    printf($BASE_FORMAT, $sBase);    # label column

    for my $iPos (0 .. $iMaxSequenceLength - 1) {
        my $iCount = $aCounts->[$iPos];
        my $iTotal = $aTotalAtPos[$iPos];

        # a base never seen at this position has no entry: count it as 0
        $iCount = 0 unless defined($iCount);

        # guard against dividing by zero
        my $dPct = $iTotal ? $iCount / $iTotal : 0;
        printf("%-${POS_WIDTH}.2f |", $dPct);
    }
    print "\n";                      # end row
}

# This is a quick way to put in some test data.
# To read it in you use DATA as a file handle;
# see above for an example.
__DATA__
ACCGT
AGCCG
CATTC
GTAAA