HAC10188 Name HAC1018 Extension .fsa Dye Green1 Allele Count 298 Size 100 100.5 101 102.1 102.6 103.3 104.1 NChar 297 Allele 0$0 0$0 1$1503 0$0 0$0 0$0 #### my $tax_sample = $data->{$taxon_label}{'Samples'}{$dye}; my $rep_sample = $data->{$rep_label} {'Samples'}{$dye}; my $nchar = ${$tax_sample->{'NChar'}}[0]; # I don't set these row sum variables to zero, because I want them undefined ifthe row is empty. my ($row_sum, $row_sum_abs, $row_sum_sq); my ($cnt00, $cnt01, $cnt10, $cnt11) = (0) x 4; my ($first_site, $last_site) = $use_filtered ? @{$thresholds->{$dye}{'indices'}} : (0, $nchar-1); #### #!/usr/bin/perl # AFLP Replicate Difference Calculator # Version 1.1 # Prints a table of differences between AFLP replicates for a given parameter. # Forms part of the "Genotyping Utilities Package" # Copyright (C) 2007 Warwick P M Allen # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # # For author's details see http://awcmee.massey.ac.nz/aflp/aflp.html ############## # CHANGE LOG # ############## # # 2007-04-23 WPMA: Created. use warnings FATAL => 'all'; use strict; use genotyping_utilities qw( &canonpath &openForWriting &readGTR &getRepsTable &transpose &isReal &getReplicateGroups ); sub safeDiv($$) {defined( $_[0] ) && defined( $_[1] ) && $_[1] ? $_[0]/$_[1] : undef} our $USAGE = "$0 [options] []\n"; my $HELP = <". A typical command line might look something like: ./AFLP_replicate_difference.pl -p "Peak Area" -r 0.01 rep_table.txt Genotypes_Table.gtr >Size_differences.txt You can even feed the output of Genotyper Rearranger straight into the Repli- cate Difference Calculator: ./genotyper_rearranger.pl gtr -o - 'Genotypes Table.txt' | ./AFLP_replicate_difference.pl -p Size rep_table.txt >Size_differences.txt Note that the file 'genotyper_rearranger.pm' must be in the path (just put it in the same directory as this script). Options: -a Display differences for all sites. The default is to display --all-sites only the differences where the site has been called differently for each of the two replicates under test. -f Only consider the sites that weren't filtered out by GeneMapper. --filter Note that columns will still exist for filtered sites, but they will be empty. -h --help Display this help. -p --parameter Use as the parameter to compare between the replicates. Note that this is case-sensitive and, to keep generality, there is no checking to ensure it is a canonical field name. Check in the input file to make sure that you have exactly the same field name. Several difference parameters may be requested. If no difference parameter is given, "Height" is used. -o --base-output-name Selects the first part of the name of the output files. -Q --remove-quotes Strip single quotes from around sample labels in the replicates table file. If this is selected, then tab characters inside single quote marks will be treated as part of the sample label (but doing this is strongly discouraged because it will confuse other programs). The default is to treat every character that is is a tab as part of the sample label. -r --rounding Rounds the differences to the nearest . The rounding is done last, so round-off errors are not propagated through to the sums at the end of each row and column. --strip-extensions Removes the extension (that is, the last dot and every character that follows it) from the sample labels given in the repetitions table file. This can be useful because, for GeneMapper input, the GenoTyper Rearranger script uses the "Sample File" field, stripped of its extension, as the sample's label. -T --transpose Transpose the output. HELP # Option defaults. my $all_sites = 0; # If false, only print differences where the site has been called # differently in each of the two replicates under test. my $indicate_reps_using = undef; # If this is defined, it is the string that delimits the sample name from # the replicate identifier. # TODO my @difference_parameters = (); my $rounding = undef; my %reps_table_options = ( 'remove_quotes' => 1, 'strip_extensions'=>0 ); my $use_filtered = 0; my $transpose = 0; my $base_fname; # Process options. while (@ARGV && $ARGV[0] =~ /^-/) { $_ = shift; # split single-letter options that are joined together /^-([^-]\S+)/ and do {unshift @ARGV, map "-$_", split //, $1} or /^-(?:a|-all-sites)/ and do {$all_sites = 1} or /^-(?:f|-filter)/ and do {$use_filtered = 1} or /^-(?:h|-help)/ and do {print( $HELP ), exit} or /^-(?:p|-parameter)/ and do {push @difference_parameters, shift; 1} or /^-(?:o|-base-output-name)/ and do {$base_fname = canonpath shift; 1} or /^-(?:Q|-remove-quotes)/ and do {$reps_table_options{'strip_extensions'} = 1} or /^-(?:r|-rounding)/ and do {$rounding = shift; 1} or /^--strip-extensions/ and do {$reps_table_options{'strip_extensions'} = 1; 1} or /^-(?:T|-transpose)/ and do {$transpose = 1} or /^-/ and die "Unknown option `$_'.\nType `$0 --help' for available options."; } # Height is the default parameter for which to calculate the differences. push @difference_parameters, 'Height' if !@difference_parameters; # Get hash table of replicates: each key in the hash table is a samle name, and points to an array of all of the # other sample names that are replicates of each other. The array includes the sample name that is the hash key. # Many hash keys can point to the same array. my $reps_table = getRepsTable \%reps_table_options, $indicate_reps_using, @ARGV ? shift() : undef; # Read in the data. my ($thresholds, $data, $comments); { if (@ARGV) { my $gtr_fname = canonpath $ARGV[0]; open GTR_IN, $gtr_fname or die "Cannot open `$gtr_fname' for reading: $!"; } else { print STDERR "Reading data from standard input ...\n"; *GTR_IN = *STDIN; } ($thresholds, $data, $comments) = readGTR *GTR_IN; close GTR_IN; } # Find the maximum number of sites over all the dyes. my $global_nchar = 0; foreach my $dye (keys %$thresholds) { # my $nchar = ${$thresholds->{$dye}{$use_filtered ? 'nchar (filtered)': 'nchar'}}[0]; my $nchar = ${$thresholds->{$dye}{'nchar'}}[0]; $global_nchar = $nchar if $nchar > $global_nchar; } # Generate the difference table. my (%differences, %totals, %grand_totals); foreach my $difference_parameter (@difference_parameters) { foreach my $dye (keys %$thresholds) { foreach my $replicate_group (getReplicateGroups( $data, $reps_table )) { next if @$replicate_group < 2; # Only one replicate in this group. my ($taxon_label, $rep_label) = @$replicate_group; my $tax_sample = $data->{$taxon_label}{'Samples'}{$dye}; my $rep_sample = $data->{$rep_label} {'Samples'}{$dye}; my $nchar = ${$tax_sample->{'NChar'}}[0]; # I don't set these row sum variables to zero, because I want them undefined ifthe row is empty. my ($row_sum, $row_sum_abs, $row_sum_sq); my ($cnt00, $cnt01, $cnt10, $cnt11) = (0) x 4; my ($first_site, $last_site) = $use_filtered ? @{$thresholds->{$dye}{'indices'}} : (0, $nchar-1); my @differences = (undef) x $nchar; for (my $i = $first_site; $i <= $last_site; $i++) { !${$tax_sample->{'Allele'}}[$i] && !${$rep_sample->{'Allele'}}[$i] and $cnt00++ or !${$tax_sample->{'Allele'}}[$i] && ${$rep_sample->{'Allele'}}[$i] and $cnt01++ or ${$tax_sample->{'Allele'}}[$i] && !${$rep_sample->{'Allele'}}[$i] and $cnt10++ or ${$tax_sample->{'Allele'}}[$i] && ${$rep_sample->{'Allele'}}[$i] and $cnt11++ ; next unless $all_sites || (${$tax_sample->{'Allele'}}[$i] xor ${$rep_sample->{'Allele'}}[$i]); my $tax_val = ${$tax_sample->{$difference_parameter}}[$i]; next if !defined $tax_val || $tax_val =~ /^\s*$/; my $rep_val = ${$rep_sample->{$difference_parameter}}[$i]; next if !defined $rep_val || $rep_val =~ /^\s*$/; $differences[$i] = $tax_val - $rep_val; $row_sum += $differences[$i] ; $row_sum_abs += abs( $differences[$i] ) ; $row_sum_sq += $differences[$i]**2; } # Sanity check. die "Count mismatch: $cnt00 + $cnt01 + $cnt10 + $cnt11 != $last_site - $first_site + 1" if $cnt00 + $cnt01 + $cnt10 + $cnt11 != $last_site - $first_site + 1; $differences{$difference_parameter}{$dye}{$taxon_label}{$rep_label} = { 'Differences' => \@differences, 'Sum' => $row_sum, 'Sum of Absolutes' => $row_sum_abs, 'Sum of Squares' => $row_sum_sq, 'No. of Sites' => $nchar, 'No. of Sites without Trailing All-Zero Sites' => (${$tax_sample->{'Allele Count'}}[0]), # HACK 'No. of Sites after Filtering' => ($last_site - $first_site + 1), # HACK 'First Site' => $first_site, 'Last Site' => $last_site, 'No. of Internal All-Zero Sites' => ${$thresholds->{$dye}{'all-zero site count'}}[0], 'No. of Internal All-Zero Sites after Filtering' => ${$thresholds->{$dye}{'all-zero site count (filtered)'}}[0], '(00)' => $cnt00, '(01)' => $cnt01, '(10)' => $cnt10, '(11)' => $cnt11, '(01+10)' => ( $cnt01 + $cnt10 ), '(01+10+11)' => ( $cnt01 + $cnt10 + $cnt11), '(00+01+10+11)' => ($cnt00 + $cnt01 + $cnt10 + $cnt11), '(Sum of Abs)/(01+10)' => safeDiv( $row_sum_abs, $cnt01 + $cnt10 ), '(Sum of Abs)/(01+10+11)' => safeDiv( $row_sum_abs, $cnt01 + $cnt10 + $cnt11 ), '(Sum of Abs)/(00+01+10+11)'=> safeDiv( $row_sum_abs, $cnt00 + $cnt01 + $cnt10 + $cnt11 ) } } foreach my $first_label (keys %{$differences{$difference_parameter}{$dye}}) { foreach my $second_label (keys %{$differences{$difference_parameter}{$dye}{$first_label}}) { my $x = $differences{$difference_parameter}{$dye}{$first_label}{$second_label}; foreach (keys %$x) { if (ref $x->{$_} eq 'ARRAY') { if (!exists $totals{$difference_parameter}{$dye}{$_}) { $totals{$difference_parameter}{$dye}{$_} = [(undef) x @{$x->{$_}}]; } for (my $i = 0; $i < @{$x->{$_}}; $i++) { defined( ${$x->{$_}}[$i] ) && isReal( ${$x->{$_}}[$i] ) and ${$totals{$difference_parameter}{$dye}{$_}}[$i] += ${$x->{$_}}[$i]; } } else { defined( $x->{$_} ) && isReal( $x->{$_} ) and $totals{$difference_parameter}{$dye}{$_} += $x->{$_}; } } } } foreach (keys %{$totals{$difference_parameter}{$dye}}) { defined( $totals{$difference_parameter}{$dye}{$_} ) and $grand_totals{$difference_parameter}{$_} += $totals{$difference_parameter}{$dye}{$_}; } } } my %error_rate; foreach (@difference_parameters) { $error_rate{$_} = [ safeDiv( $grand_totals{$_}{'(01+10)'}, $grand_totals{$_}{ '(01+10+11)'} ), safeDiv( $grand_totals{$_}{'(01+10)'}, $grand_totals{$_}{'(00+01+10+11)'} ), safeDiv( $grand_totals{$_}{'(01+10)'}, (defined $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} ? $grand_totals{$_}{ '(01+10+11)'} - $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} : undef) ), safeDiv( $grand_totals{$_}{'(01+10)'}, (defined $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} ? $grand_totals{$_}{'(00+01+10+11)'} - $grand_totals{$_}{'No. of Internal All-Zero Sites after Filtering'} : undef) ) ] } # Round. if (defined $rounding) { my $round = sub (;$) { local $_ = @_ ? $_[0] : $_; isReal and sprintf( '%d', $_/$rounding )*$rounding; }; foreach my $difference_parameter (@difference_parameters) { foreach my $dye (keys %{$differences{$difference_parameter}}) { foreach my $first_rep (keys %{$differences{$difference_parameter}{$dye}}) { foreach my $second_rep (keys %{$differences{$difference_parameter}{$dye}{$first_rep}}) { my $diff = $differences{$difference_parameter}{$dye}{$first_rep}{$second_rep}; foreach (keys %$diff) { if (ref $diff->{$_} eq 'ARRAY') { $_ = &$round foreach @{$diff->{$_}} } else { $diff->{$_} = &$round( $diff->{$_} ); } } } } foreach (keys %{$totals{$difference_parameter}{$dye}}) { if (ref $totals{$difference_parameter}{$dye}{$_} eq 'ARRAY') { map {&$round()} @{$totals{$difference_parameter}{$dye}{$_}}; } else { $totals{$difference_parameter}{$dye}{$_} = &$round( $totals{$difference_parameter}{$dye}{$_} ); } } foreach (keys %{$totals{$difference_parameter}{$dye}}) { $grand_totals{$difference_parameter}{$_} = &$round( $grand_totals{$difference_parameter}{$_} ); } } map {$_ = &$round()} @{$error_rate{$difference_parameter}}; } } # Generate the output matrices. my %output; foreach my $difference_parameter (@difference_parameters) { push @{$output{$difference_parameter}}, ["Difference table for $difference_parameter"], []; push @{$output{$difference_parameter}}, [ map {"'$_'"} 'First replicate', 'Second replicate', 'Dye', (0..$global_nchar+1), #HACK 'Sum', 'Sum of Absolutes', 'Sum of Squares', 'No. of Sites', 'No. of Sites without Trailing All-Zero Sites', 'No. of Sites after Filtering', 'First Site', 'Last Site', 'No. of Internal All-Zero Sites', 'No. of Internal All-Zero Sites after Filtering', '(00)', '(01)', '(10)', '(11)', '(01+10)', '(01+10+11)', '(00+01+10+11)', '(Sum of Abs)/(01+10)', '(Sum of Abs)/(01+10+11)', '(Sum of Abs)/(00+01+10+11)' ]; foreach my $dye (keys %{$differences{$difference_parameter}}) { foreach my $first_rep (keys %{$differences{$difference_parameter}{$dye}}) { foreach my $second_rep (keys %{$differences{$difference_parameter}{$dye}{$first_rep}}) { my $diff = $differences{$difference_parameter}{$dye}{$first_rep}{$second_rep}; push @{$output{$difference_parameter}}, [ $first_rep, $second_rep, $dye, map( {defined() ? $_ : ''} @{$diff->{'Differences'}} ), map( {defined() ? $_ : ''} map( {$diff->{$_}} 'Sum', 'Sum of Absolutes', 'Sum of Squares', 'No. of Sites', 'No. of Sites without Trailing All-Zero Sites', 'No. of Sites after Filtering', 'First Site', 'Last Site', 'No. of Internal All-Zero Sites', 'No. of Internal All-Zero Sites after Filtering', '(00)', '(01)', '(10)', '(11)', '(01+10)', '(01+10+11)', '(00+01+10+11)', '(Sum of Abs)/(01+10)', '(Sum of Abs)/(01+10+11)', '(Sum of Abs)/(00+01+10+11)' ) ) ]; } } my @totals_row = ('Sum', undef, $dye); foreach ( 'Differences', 'Sum', 'Sum of Absolutes', 'Sum of Squares', 'No. of Sites', 'No. of Sites without Trailing All-Zero Sites', 'No. of Sites after Filtering', 'First Site', 'Last Site', 'No. of Internal All-Zero Sites', 'No. of Internal All-Zero Sites after Filtering', '(00)', '(01)', '(10)', '(11)', '(01+10)', '(01+10+11)', '(00+01+10+11)', '(Sum of Abs)/(01+10)', '(Sum of Abs)/(01+10+11)', '(Sum of Abs)/(00+01+10+11)' ) { if (ref $totals{$difference_parameter}{$dye}{$_} eq 'ARRAY') { push @totals_row, @{$totals{$difference_parameter}{$dye}{$_}} } else { push @totals_row, $totals{$difference_parameter}{$dye}{$_} } } push @{$output{$difference_parameter}}, [], \@totals_row, [], []; } # Transpose the table. $output{$difference_parameter} = transpose $output{$difference_parameter} if $transpose; push @{$output{$difference_parameter}}, [], ["Totals Over All Dyes"], [], map( {["'$_'", '', $grand_totals{$difference_parameter}{$_}]} 'Sum', 'Sum of Absolutes', 'Sum of Squares', 'No. of Sites', 'No. of Sites without Trailing All-Zero Sites', 'No. of Sites after Filtering', 'First Site', 'Last Site', 'No. of Internal All-Zero Sites', 'No. of Internal All-Zero Sites after Filtering', '(00)', '(01)', '(10)', '(11)', '(01+10)', '(01+10+11)', '(00+01+10+11)', ), ["'(01+10)/(01+10+11)'", '', ${$error_rate{$difference_parameter}}[0]], ["'(01+10)/(00+01+10+11)'", '', ${$error_rate{$difference_parameter}}[1]], ["'(01+10)/((01+10+11) - (No. of Internal All-Zero Sites after Filtering))'", '', ${$error_rate{$difference_parameter}}[2]], ["'(01+10)/((00+01+10+11) - (No. of Internal All-Zero Sites after Filtering))'", '', ${$error_rate{$difference_parameter}}[3]], []; } # Get the base file name. $base_fname = getBaseFName( @ARGV && $ARGV[0], $comments ) unless defined $base_fname; # Print. { my $output_fname = $base_fname . '.rdc'; *OUT = defined $output_fname ? openForWriting $output_fname : *STDOUT; local ($\,$,) = ("\n","\t"); foreach my $difference_parameter (@difference_parameters) { print OUT map {defined() ? $_ : ''} @$_ foreach @{$output{$difference_parameter}}; print OUT ""; } close( OUT ), print( STDERR "Wrote `$output_fname'." ) if defined $output_fname; } #my %heights; ## Create "reps-in-row.heights" table. #{ # local ($,, $/) = ("\t", "\n"); # my $file_number = 1; # foreach my $replicate_group (@rep_groups) { # next if @$replicate_group < 2; # my $number_of_site_for_this_group = 0; # $number_of_site_for_this_group += ${ $data->{(keys %$data)[0]}{'Samples'}{$_}{'NChar'} }[0] foreach keys %$threshold_indices; # foreach my $dye (keys %$threshold_indices) { # my $nchar = ${ $data->{(keys %$data)[0]}{'Samples'}{$dye}{'NChar'} }[0]; # for (my $i = 1; $i <= $nchar; $i++) { # my $height_is_registered = # defined( ${$replicate_group}[0] ) && isReal( ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] ) && # defined( ${$replicate_group}[1] ) && isReal( ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] ); # my $height_meets_threshold = $height_is_registered && # ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] >= $peak_height_threshold && # ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] >= $peak_height_threshold; # if ($height_meets_threshold) { # push @{$heights{$dye}}, [ # $dye, # $i, # @$replicate_group, # (defined ${$replicate_group}[0] ? ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] : ''), # (defined ${$replicate_group}[1] ? ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] : ''), # (defined ${$replicate_group}[0] && defined ${$replicate_group}[1] ? # abs( ${$data->{${$replicate_group}[0]}{'Samples'}{$dye}{'Height'}}[$i] - # ${$data->{${$replicate_group}[1]}{'Samples'}{$dye}{'Height'}}[$i] ) : '' ) # ]; # } # } # } # } # # $file_number = 1; # foreach my $dye (keys %$threshold_indices) { # *OUT = openForWriting $base_fname.'.reps-in-row.heights'.$file_number.'.tab.txt'; # print OUT 'Dye', 'Site', 'Replicate 1', 'Replicate 2', 'Height 1', 'Height 2', 'Height Difference'; # $heights{$dye} = [sort {${$a}[6] <=> ${$b}[6]} @{$heights{$dye}}]; # map {print OUT @$_} @{$heights{$dye}}; # close OUT; # print STDERR "Wrote `".$base_fname.'.reps-in-row.heights'.($file_number++).'.tab.txt'."'."; # } #} #### # Reads in GTR file. # Takes a filehandle opened for reading. Returns two hash references. One contains information specific to # particular dyes: the threshold values and indices (the threshold determine which sites get filtered out) and # the number of character in the filtered and unfiltered ... sub readGTR($) { *GTR_IN = shift; local $/ = getLineEnding *GTR_IN; my (%thresholds, %data, %comments, @labels); local $_; while () { chomp; $comments{$.} = $1, next if /^#(.*)/; next if /^$/; last if /^\%DATA%$/; my $dye = $_; while () { chomp; $comments{$.} = $1, next if /^#(.*)/; last if /^$/; my ($key, $value) = /^\s*(\w.*?):\t(.*)$/ or die "Error reading line $.:\n$_\n"; # Have to append a dummy character to $value to force split() to return trailing empty fields. @{$thresholds{$dye}{$key}} = map {$_ eq 'undef' ? undef : $_} split /\t/, $value.chr(0); substr( ${$thresholds{$dye}{$key}}[-1], -1 ) = ""; # Remove the dummy character. } } while () { chomp; next if /^$/ || /^#/; my $taxon_label = $_; push @labels, $taxon_label; # To record the order of the taxa. while ( =~ /\s*(.*?)\t(.*)$/) { $data{$taxon_label}{$1} = $2; } my $dye; while () { chomp; next if /^#/; last if /^$/; ($dye) = /^\s*Dye\s+(.*)/; while () { chomp; next if /^#/; last if /^$/; my @values = split /\t/; my $key = shift @values; $data{$taxon_label}{'Samples'}{$dye}{$key} = \@values; } } } return \%thresholds, \%data, \%comments, \@labels; }