#!/software/bin/perl use strict; use warnings ; use File::Basename ; use Getopt::Long; use File::Basename; use File::Listing; use Data::Dumper; use Algorithm::Permute ; use Tie::IxHash; use Array::Each; GetOptions( "if=s" => \my $input_file, "sf=s" => \my $source_file, "od=s" => \my $outputdir, "d=s" => \my $date, ); if(! $outputdir) {$outputdir = "/nfs/users/nfs_a/aj6/CGP/Sequenom/perl/tests/"}; my $filename = fileparse($input_file); #Output locations and names defaults. my $top_hits = $outputdir . "/$date" ."_seq_comparison_top_hits.csv"; my $out = $outputdir . "/$date" ."_seq_comparison_summary.csv" ; my $log = $outputdir . "/$date" ."_seq_comparison_log.log"; #my ($sample_set,$plex_set) = parseGenotypeFile($input_file) ; my ($source_set) = ParseSourcefile($source_file); my @samples = sort keys %{ $source_set }; foreach my $sam(@samples){ print SUM $sam ,","; } # print SUM "\n"; # my @redo_samples; my %rest_samples; my $samples_report; for( my $i= 0 ; $i < scalar( @samples ); $i++ ){ my $s1 = $samples[ $i ]; #my $rsid1 = $sample_set->{ $s1 }; #get the source genotype results for the sample s1 my $geno1 = $source_set->{ $s1 }; my $count = 0; print SUM "$s1,"; #to get the samples with first 20 callrates as NN among the 30 and to get all the samples which has 50 NNs among the 96. my $one_sample1 = Array::Each->new(\@$geno1); while(my ($g1, $index) = $one_sample1->each() ) { if($g1 =~ /^NN/i) { $count++; if($count >=20 && $index ==30){ push @redo_samples , $s1; # print "$s1|@$geno1\n"; } if($count >=50 && $index == 96 ) { #push @$samples_report, $s1; print LOG "$s1|@$geno1\n"; } } } }#end of for $i #get the samples that has first20 pass callrates for samples. #difference between the @samples and @redo_samples. %rest_samples = map{$_=>1} @redo_samples; my @sam2com = grep ( ! defined $rest_samples{$_}, @samples); #print foreach "@sam2com\n"; my $com_sam = {}; for (my $j = 0; $j<=scalar(@sam2com);$j++){ my $s1 = $sam2com[ $j ] ; my $geno1 = $source_set->{$s1}; my $top_percent = 0; my $top = ''; for (my $k = 0;$k<=scalar(@sam2com);$k++){ my $match = 0; my $s2 = $sam2com[ $k ]; my $geno2 = $source_set->{$s2}; my $set = Array::Each->new(\@$geno1, \@$geno2); while (my($g1, $g2, $index2) = $set->each() ){ #print "$s1|$s2|$g1|$g2|$index2\n"; next if $g1 eq "" || $g2 eq ""; next if $g1 =~ /^NN/i || $g2 =~ /^NN/i; if($g1 eq $g2){ $match++; #print "$s1|$s2|$g1|$g2|$match|$index2\n"; }#end of if $g1 eq $g2 }#end of while loop $set->each my $percentage = sprintf "%.2f", ($match * 100)/( scalar @$geno1 ) ; print SUM $percentage, ","; next if ($percentage < 75); #print "$s1|$s2|$percentage\n"; if( ( $percentage >=$top_percent) and ($top ne $s1 ) ){ $top_percent= $percentage; $top = $s2; } #end of if $percentage >=$top_percent) and $top ne $s1 push @{ $com_sam->{ $s1 }->{ $percentage } }, { sample =>$s2, percent =>$percentage, match =>$match }; my $ge1 = join "", @$geno1; my $ge2 = join "", @$geno2; if( ( $ge1 eq $ge2 ) and ( $s1 ne $s2 ) ) { print LOG "$s1|$s2|$ge1 | $ge2\n"; } }#end of for $k sam2com print SUM "\n"; #sort by percentage in desending order. Get the samples match to other sample percentage of match and top hit . foreach my $percent ( sort { $b <=> $a } keys %{ $com_sam->{ $s1 } } ){ my $match_samples = $com_sam->{ $s1 }->{ $percent }; foreach my $matSam( @ { $match_samples } ){ if( ( $s1 ne $matSam->{ sample } ) and ($matSam->{ percent } >= $top_percent) ) { print LOG "Sample $s1 matches with $matSam->{ sample } with $matSam->{ percent }\n"; } #else{ my $l = sprintf "%s, %s, %0.2f, %s, %0.2f ", $s1, $matSam->{ sample }, $matSam->{ percent }, $top, $top_percent; print OUT $l,"\n"; #} }#end of forach $match->sample }#end of percentage foreach loop }#end of for $j @sam2com. ##-------------------------------- sub ParseSourcefile{ my $file = shift; my $source = {}; open(SFOFN,"$source_file") or usage("FATAL ERROR: could not open file '$input_file' $!\n") ; while(){ chomp; next if /^Sample/; my ($sam_name, @genotype ) = split(',', $_); $source->{$sam_name} = \@genotype; }#end of while loop close SFOFN; return($source); }#end of sub ParseSourcefile