HAC10188
Name HAC1018
Extension .fsa
Dye Green1
Allele Count 298
Size 100 100.5 101 102.1 102.6 103.3 104.1
NChar 297
Allele 0$0 0$0 1$1503 0$0 0$0 0$0
####
my $tax_sample = $data->{$taxon_label}{'Samples'}{$dye};
my $rep_sample = $data->{$rep_label} {'Samples'}{$dye};
my $nchar = ${$tax_sample->{'NChar'}}[0];
# I don't set these row sum variables to zero, because I want them undefined ifthe row is empty.
my ($row_sum, $row_sum_abs, $row_sum_sq);
my ($cnt00, $cnt01, $cnt10, $cnt11) = (0) x 4;
my ($first_site, $last_site) = $use_filtered ? @{$thresholds->{$dye}{'indices'}} : (0, $nchar-1);
####
#!/usr/bin/perl
# AFLP Replicate Difference Calculator
# Version 1.1
# Prints a table of differences between AFLP replicates for a given parameter.
# Forms part of the "Genotyping Utilities Package"
# Copyright (C) 2007 Warwick P M Allen
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# For author's details see http://awcmee.massey.ac.nz/aflp/aflp.html
##############
# CHANGE LOG #
##############
#
# 2007-04-23 WPMA: Created.
use warnings FATAL => 'all';
use strict;
use genotyping_utilities qw( &canonpath &openForWriting &readGTR &getRepsTable &transpose &isReal &getReplicateGroups );
sub safeDiv($$) {defined( $_[0] ) && defined( $_[1] ) && $_[1] ? $_[0]/$_[1] : undef}
our $USAGE = "$0 [options] []\n";
my $HELP = <".
A typical command line might look something like:
./AFLP_replicate_difference.pl -p "Peak Area" -r 0.01 rep_table.txt Genotypes_Table.gtr >Size_differences.txt
You can even feed the output of Genotyper Rearranger straight into the Repli-
cate Difference Calculator:
./genotyper_rearranger.pl gtr -o - 'Genotypes Table.txt' | ./AFLP_replicate_difference.pl -p Size rep_table.txt >Size_differences.txt
Note that the file 'genotyper_rearranger.pm' must be in the path (just put it
in the same directory as this script).
Options:
-a Display differences for all sites. The default is to display
--all-sites only the differences where the site has been called differently
for each of the two replicates under test.
-f Only consider the sites that weren't filtered out by GeneMapper.
--filter Note that columns will still exist for filtered sites, but they
will be empty.
-h
--help Display this help.
-p
--parameter
Use as the parameter to compare between
the replicates. Note that this is case-sensitive and, to keep
generality, there is no checking to ensure it is a canonical
field name. Check in the input file to make sure that you have
exactly the same field name. Several difference parameters may
be requested. If no difference parameter is given, "Height" is
used.
-o
--base-output-name
Selects the first part of the name of the output files.
-Q
--remove-quotes
Strip single quotes from around sample labels in the replicates
table file. If this is selected, then tab characters inside
single quote marks will be treated as part of the sample label
(but doing this is strongly discouraged because it will confuse
other programs). The default is to treat every character that is
is a tab as part of the sample label.
-r
--rounding
Rounds the differences to the nearest . The rounding is
done last, so round-off errors are not propagated through to the
sums at the end of each row and column.
--strip-extensions
Removes the extension (that is, the last dot and every character
that follows it) from the sample labels given in the repetitions
table file. This can be useful because, for GeneMapper input,
the GenoTyper Rearranger script uses the "Sample File" field,
stripped of its extension, as the sample's label.
-T
--transpose Transpose the output.
HELP
# Option defaults.
my $all_sites = 0; # If false, only print differences where the site has been called
# differently in each of the two replicates under test.
my $indicate_reps_using = undef; # If this is defined, it is the string that delimits the sample name from
# the replicate identifier. # TODO
my @difference_parameters = ();
my $rounding = undef;
my %reps_table_options = (
'remove_quotes' => 1,
'strip_extensions'=>0
);
my $use_filtered = 0;
my $transpose = 0;
my $base_fname;
# Process options.
while (@ARGV && $ARGV[0] =~ /^-/) {
$_ = shift;
# split single-letter options that are joined together
/^-([^-]\S+)/ and do {unshift @ARGV, map "-$_", split //, $1} or
/^-(?:a|-all-sites)/ and do {$all_sites = 1} or
/^-(?:f|-filter)/ and do {$use_filtered = 1} or
/^-(?:h|-help)/ and do {print( $HELP ), exit} or
/^-(?:p|-parameter)/ and do {push @difference_parameters, shift; 1} or
/^-(?:o|-base-output-name)/ and do {$base_fname = canonpath shift; 1} or
/^-(?:Q|-remove-quotes)/ and do {$reps_table_options{'strip_extensions'} = 1} or
/^-(?:r|-rounding)/ and do {$rounding = shift; 1} or
/^--strip-extensions/ and do {$reps_table_options{'strip_extensions'} = 1; 1} or
/^-(?:T|-transpose)/ and do {$transpose = 1} or
/^-/ and die "Unknown option `$_'.\nType `$0 --help' for available options.";
}
# Height is the default parameter for which to calculate the differences.
push @difference_parameters, 'Height' if !@difference_parameters;
# Get hash table of replicates: each key in the hash table is a samle name, and points to an array of all of the
# other sample names that are replicates of each other. The array includes the sample name that is the hash key.
# Many hash keys can point to the same array.
my $reps_table = getRepsTable \%reps_table_options, $indicate_reps_using, @ARGV ? shift() : undef;
# Read in the data.
my ($thresholds, $data, $comments);
{
if (@ARGV) {
my $gtr_fname = canonpath $ARGV[0];
open GTR_IN, $gtr_fname or die "Cannot open `$gtr_fname' for reading: $!";
} else {
print STDERR "Reading data from standard input ...\n";
*GTR_IN = *STDIN;
}
($thresholds, $data, $comments) = readGTR *GTR_IN;
close GTR_IN;
}
# Find the maximum number of sites over all the dyes.
my $global_nchar = 0;
foreach my $dye (keys %$thresholds) {
# my $nchar = ${$thresholds->{$dye}{$use_filtered ? 'nchar (filtered)': 'nchar'}}[0];
my $nchar = ${$thresholds->{$dye}{'nchar'}}[0];
$global_nchar = $nchar if $nchar > $global_nchar;
}
# Generate the difference table.
my (%differences, %totals, %grand_totals);
foreach my $difference_parameter (@difference_parameters) {
foreach my $dye (keys %$thresholds) {
foreach my $replicate_group (getReplicateGroups( $data, $reps_table )) {
next if @$replicate_group < 2; # Only one replicate in this group.
my ($taxon_label, $rep_label) = @$replicate_group;
my $tax_sample = $data->{$taxon_label}{'Samples'}{$dye};
my $rep_sample = $data->{$rep_label} {'Samples'}{$dye};
my $nchar = ${$tax_sample->{'NChar'}}[0];
# I don't set these row sum variables to zero, because I want them undefined ifthe row is empty.
my ($row_sum, $row_sum_abs, $row_sum_sq);
my ($cnt00, $cnt01, $cnt10, $cnt11) = (0) x 4;
my ($first_site, $last_site) = $use_filtered ? @{$thresholds->{$dye}{'indices'}} : (0, $nchar-1);
my @differences = (undef) x $nchar;
for (my $i = $first_site; $i <= $last_site; $i++) {
!${$tax_sample->{'Allele'}}[$i] && !${$rep_sample->{'Allele'}}[$i] and $cnt00++ or
!${$tax_sample->{'Allele'}}[$i] && ${$rep_sample->{'Allele'}}[$i] and $cnt01++ or
${$tax_sample->{'Allele'}}[$i] && !${$rep_sample->{'Allele'}}[$i] and $cnt10++ or
${$tax_sample->{'Allele'}}[$i] && ${$rep_sample->{'Allele'}}[$i] and $cnt11++ ;
next unless $all_sites || (${$tax_sample->{'Allele'}}[$i] xor ${$rep_sample->{'Allele'}}[$i]);
my $tax_val = ${$tax_sample->{$difference_parameter}}[$i]; next if !defined $tax_val || $tax_val =~ /^\s*$/;
my $rep_val = ${$rep_sample->{$difference_parameter}}[$i]; next if !defined $rep_val || $rep_val =~ /^\s*$/;
$differences[$i] = $tax_val - $rep_val;
$row_sum += $differences[$i] ;
$row_sum_abs += abs( $differences[$i] ) ;
$row_sum_sq += $differences[$i]**2;
}
# Sanity check.
die "Count mismatch: $cnt00 + $cnt01 + $cnt10 + $cnt11 != $last_site - $first_site + 1"
if $cnt00 + $cnt01 + $cnt10 + $cnt11 != $last_site - $first_site + 1;
$differences{$difference_parameter}{$dye}{$taxon_label}{$rep_label} = {
'Differences' => \@differences,
'Sum' => $row_sum,
'Sum of Absolutes' => $row_sum_abs,
'Sum of Squares' => $row_sum_sq,
'No. of Sites' => $nchar,
'No. of Sites without Trailing All-Zero Sites' => (${$tax_sample->{'Allele Count'}}[0]), # HACK
'No. of Sites after Filtering' => ($last_site - $first_site + 1), # HACK
'First Site' => $first_site,
'Last Site' => $last_site,
'No. of Internal All-Zero Sites' => ${$thresholds->{$dye}{'all-zero site count'}}[0],
'No. of Internal All-Zero Sites after Filtering' => ${$thresholds->{$dye}{'all-zero site count (filtered)'}}[0],
'(00)' => $cnt00,
'(01)' => $cnt01,
'(10)' => $cnt10,
'(11)' => $cnt11,
'(01+10)' => ( $cnt01 + $cnt10 ),
'(01+10+11)' => ( $cnt01 + $cnt10 + $cnt11),
'(00+01+10+11)' => ($cnt00 + $cnt01 + $cnt10 + $cnt11),
'(Sum of Abs)/(01+10)' => safeDiv( $row_sum_abs, $cnt01 + $cnt10 ),
'(Sum of Abs)/(01+10+11)' => safeDiv( $row_sum_abs, $cnt01 + $cnt10 + $cnt11 ),
'(Sum of Abs)/(00+01+10+11)'=> safeDiv( $row_sum_abs, $cnt00 + $cnt01 + $cnt10 + $cnt11 )
}
}
foreach my $first_label (keys %{$differences{$difference_parameter}{$dye}}) {
foreach my $second_label (keys %{$differences{$difference_parameter}{$dye}{$first_label}}) {
my $x = $differences{$difference_parameter}{$dye}{$first_label}{$second_label};
foreach (keys %$x) {
if (ref $x->{$_} eq 'ARRAY') {
if (!exists $totals{$difference_parameter}{$dye}{$_}) {
$totals{$difference_parameter}{$dye}{$_} = [(undef) x @{$x->{$_}}];
}
for (my $i = 0; $i < @{$x->{$_}}; $i++) {
defined( ${$x->{$_}}[$i] ) && isReal( ${$x->{$_}}[$i] ) and
${$totals{$difference_parameter}{$dye}{$_}}[$i] += ${$x->{$_}}[$i];
}
} else {
defined( $x->{$_} ) && isReal( $x->{$_} ) and
$totals{$difference_parameter}{$dye}{$_} += $x->{$_};
}
}
}
}
foreach (keys %{$totals{$difference_parameter}{$dye}}) {
defined( $totals{$difference_parameter}{$dye}{$_} ) and
$grand_totals{$difference_parameter}{$_} += $totals{$difference_parameter}{$dye}{$_};
}
}
}
my %error_rate;
foreach (@difference_parameters) {
$error_rate{$_} = [
safeDiv( $grand_totals{$_}{'(01+10)'}, $grand_totals{$_}{ '(01+10+11)'} ),
safeDiv( $grand_totals{$_}{'(01+10)'}, $grand_totals{$_}{'(00+01+10+11)'} ),
safeDiv( $grand_totals{$_}{'(01+10)'},
(defined $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} ?
$grand_totals{$_}{ '(01+10+11)'} - $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} : undef) ),
safeDiv( $grand_totals{$_}{'(01+10)'},
(defined $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} ?
$grand_totals{$_}{'(00+01+10+11)'} - $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} : undef) )
]
}
# Round.
if (defined $rounding) {
my $round = sub (;$) {
local $_ = @_ ? $_[0] : $_;
isReal and sprintf( '%d', $_/$rounding )*$rounding;
};
foreach my $difference_parameter (@difference_parameters) {
foreach my $dye (keys %{$differences{$difference_parameter}}) {
foreach my $first_rep (keys %{$differences{$difference_parameter}{$dye}}) {
foreach my $second_rep (keys %{$differences{$difference_parameter}{$dye}{$first_rep}}) {
my $diff = $differences{$difference_parameter}{$dye}{$first_rep}{$second_rep};
foreach (keys %$diff) {
if (ref $diff->{$_} eq 'ARRAY') {
$_ = &$round foreach @{$diff->{$_}}
} else {
$diff->{$_} = &$round( $diff->{$_} );
}
}
}
}
foreach (keys %{$totals{$difference_parameter}{$dye}}) {
if (ref $totals{$difference_parameter}{$dye}{$_} eq 'ARRAY') {
map {&$round()} @{$totals{$difference_parameter}{$dye}{$_}};
} else {
$totals{$difference_parameter}{$dye}{$_} = &$round( $totals{$difference_parameter}{$dye}{$_} );
}
}
foreach (keys %{$totals{$difference_parameter}{$dye}}) {
$grand_totals{$difference_parameter}{$_} = &$round( $grand_totals{$difference_parameter}{$_} );
}
}
map {$_ = &$round()} @{$error_rate{$difference_parameter}};
}
}
# Generate the output matrices.
my %output;
foreach my $difference_parameter (@difference_parameters) {
push @{$output{$difference_parameter}}, ["Difference table for $difference_parameter"], [];
push @{$output{$difference_parameter}}, [ map {"'$_'"}
'First replicate',
'Second replicate',
'Dye',
(0..$global_nchar+1), #HACK
'Sum',
'Sum of Absolutes',
'Sum of Squares',
'No. of Sites',
'No. of Sites without Trailing All-Zero Sites',
'No. of Sites after Filtering',
'First Site',
'Last Site',
'No. of Internal All-Zero Sites',
'No. of Internal All-Zero Sites after Filtering',
'(00)',
'(01)',
'(10)',
'(11)',
'(01+10)',
'(01+10+11)',
'(00+01+10+11)',
'(Sum of Abs)/(01+10)',
'(Sum of Abs)/(01+10+11)',
'(Sum of Abs)/(00+01+10+11)'
];
foreach my $dye (keys %{$differences{$difference_parameter}}) {
foreach my $first_rep (keys %{$differences{$difference_parameter}{$dye}}) {
foreach my $second_rep (keys %{$differences{$difference_parameter}{$dye}{$first_rep}}) {
my $diff = $differences{$difference_parameter}{$dye}{$first_rep}{$second_rep};
push @{$output{$difference_parameter}}, [
$first_rep,
$second_rep,
$dye,
map( {defined() ? $_ : ''} @{$diff->{'Differences'}} ),
map( {defined() ? $_ : ''} map( {$diff->{$_}}
'Sum',
'Sum of Absolutes',
'Sum of Squares',
'No. of Sites',
'No. of Sites without Trailing All-Zero Sites',
'No. of Sites after Filtering',
'First Site',
'Last Site',
'No. of Internal All-Zero Sites',
'No. of Internal All-Zero Sites after Filtering',
'(00)',
'(01)',
'(10)',
'(11)',
'(01+10)',
'(01+10+11)',
'(00+01+10+11)',
'(Sum of Abs)/(01+10)',
'(Sum of Abs)/(01+10+11)',
'(Sum of Abs)/(00+01+10+11)'
) )
];
}
}
my @totals_row = ('Sum', undef, $dye);
foreach (
'Differences',
'Sum',
'Sum of Absolutes',
'Sum of Squares',
'No. of Sites',
'No. of Sites without Trailing All-Zero Sites',
'No. of Sites after Filtering',
'First Site',
'Last Site',
'No. of Internal All-Zero Sites',
'No. of Internal All-Zero Sites after Filtering',
'(00)',
'(01)',
'(10)',
'(11)',
'(01+10)',
'(01+10+11)',
'(00+01+10+11)',
'(Sum of Abs)/(01+10)',
'(Sum of Abs)/(01+10+11)',
'(Sum of Abs)/(00+01+10+11)'
) {
if (ref $totals{$difference_parameter}{$dye}{$_} eq 'ARRAY') {
push @totals_row, @{$totals{$difference_parameter}{$dye}{$_}}
} else {
push @totals_row, $totals{$difference_parameter}{$dye}{$_}
}
}
push @{$output{$difference_parameter}}, [], \@totals_row, [], [];
}
# Transpose the table.
$output{$difference_parameter} = transpose $output{$difference_parameter} if $transpose;
push @{$output{$difference_parameter}},
[], ["Totals Over All Dyes"], [],
map( {["'$_'", '', $grand_totals{$difference_parameter}{$_}]}
'Sum',
'Sum of Absolutes',
'Sum of Squares',
'No. of Sites',
'No. of Sites without Trailing All-Zero Sites',
'No. of Sites after Filtering',
'First Site',
'Last Site',
'No. of Internal All-Zero Sites',
'No. of Internal All-Zero Sites after Filtering',
'(00)',
'(01)',
'(10)',
'(11)',
'(01+10)',
'(01+10+11)',
'(00+01+10+11)',
),
["'(01+10)/(01+10+11)'", '', ${$error_rate{$difference_parameter}}[0]],
["'(01+10)/(00+01+10+11)'", '', ${$error_rate{$difference_parameter}}[1]],
["'(01+10)/((01+10+11) - (No. of Internal All-Zero Sites after Filtering))'", '', ${$error_rate{$difference_parameter}}[2]],
["'(01+10)/((00+01+10+11) - (No. of Internal All-Zero Sites after Filtering))'", '', ${$error_rate{$difference_parameter}}[3]],
[];
}
# Get the base file name.
$base_fname = getBaseFName( @ARGV && $ARGV[0], $comments ) unless defined $base_fname;
# Print.
{
my $output_fname = $base_fname . '.rdc';
*OUT = defined $output_fname ? openForWriting $output_fname : *STDOUT;
local ($\,$,) = ("\n","\t");
foreach my $difference_parameter (@difference_parameters) {
print OUT map {defined() ? $_ : ''} @$_ foreach @{$output{$difference_parameter}};
print OUT "";
}
close( OUT ),
print( STDERR "Wrote `$output_fname'." ) if defined $output_fname;
}
#my %heights;
## Create "reps-in-row.heights" table.
#{
# local ($,, $/) = ("\t", "\n");
# my $file_number = 1;
# foreach my $replicate_group (@rep_groups) {
# next if @$replicate_group < 2;
# my $number_of_site_for_this_group = 0;
# $number_of_site_for_this_group += ${ $data->{(keys %$data)[0]}{'Samples'}{$_}{'NChar'} }[0] foreach keys %$threshold_indices;
# foreach my $dye (keys %$threshold_indices) {
# my $nchar = ${ $data->{(keys %$data)[0]}{'Samples'}{$dye}{'NChar'} }[0];
# for (my $i = 1; $i <= $nchar; $i++) {
# my $height_is_registered =
# defined( ${$replicate_group}[0] ) && isReal( ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] ) &&
# defined( ${$replicate_group}[1] ) && isReal( ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] );
# my $height_meets_threshold = $height_is_registered &&
# ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] >= $peak_height_threshold &&
# ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] >= $peak_height_threshold;
# if ($height_meets_threshold) {
# push @{$heights{$dye}}, [
# $dye,
# $i,
# @$replicate_group,
# (defined ${$replicate_group}[0] ? ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] : ''),
# (defined ${$replicate_group}[1] ? ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] : ''),
# (defined ${$replicate_group}[0] && defined ${$replicate_group}[1] ?
# abs( ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] -
# ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] ) : '' )
# ];
# }
# }
# }
# }
#
# $file_number = 1;
# foreach my $dye (keys %$threshold_indices) {
# *OUT = openForWriting $base_fname.'.reps-in-row.heights'.$file_number.'.tab.txt';
# print OUT 'Dye', 'Site', 'Replicate 1', 'Replicate 2', 'Height 1', 'Height 2', 'Height Difference';
# $heights{$dye} = [sort {${$a}[6] <=> ${$b}[6]} @{$heights{$dye}}];
# map {print OUT @$_} @{$heights{$dye}};
# close OUT;
# print STDERR "Wrote `".$base_fname.'.reps-in-row.heights'.($file_number++).'.tab.txt'."'.";
# }
#}
####
# Reads in GTR file.
# Takes a filehandle opened for reading. Returns two hash references. One contains information specific to
# particular dyes: the threshold values and indices (the threshold determine which sites get filtered out) and
# the number of character in the filtered and unfiltered ...
sub readGTR($) {
*GTR_IN = shift;
local $/ = getLineEnding *GTR_IN;
my (%thresholds, %data, %comments, @labels);
local $_;
while () {
chomp;
$comments{$.} = $1,
next if /^#(.*)/;
next if /^$/;
last if /^\%DATA%$/;
my $dye = $_;
while () {
chomp;
$comments{$.} = $1,
next if /^#(.*)/;
last if /^$/;
my ($key, $value) = /^\s*(\w.*?):\t(.*)$/ or die "Error reading line $.:\n$_\n";
# Have to append a dummy character to $value to force split() to return trailing empty fields.
@{$thresholds{$dye}{$key}} = map {$_ eq 'undef' ? undef : $_} split /\t/, $value.chr(0);
substr( ${$thresholds{$dye}{$key}}[-1], -1 ) = ""; # Remove the dummy character.
}
}
while () {
chomp;
next if /^$/ || /^#/;
my $taxon_label = $_;
push @labels, $taxon_label; # To record the order of the taxa.
while ( =~ /\s*(.*?)\t(.*)$/) {
$data{$taxon_label}{$1} = $2;
}
my $dye;
while () {
chomp;
next if /^#/;
last if /^$/;
($dye) = /^\s*Dye\s+(.*)/;
while () {
chomp;
next if /^#/;
last if /^$/;
my @values = split /\t/;
my $key = shift @values;
$data{$taxon_label}{'Samples'}{$dye}{$key} = \@values;
}
}
}
return \%thresholds, \%data, \%comments, \@labels;
}