######## now read in taxid2locus file and print out taxids for host ## easier for trees and solves problem for wgs genomes my $tax2locus_file="/g/bork6/waller/Viruses/prophage_data/tax2locus.dat"; my %tax2loc; open (IN,$tax2locus_file) or die "cannot open $tax2locus_file\n"; while(){ chomp; my($taxid,$locus)=split(/\t/,$_); $tax2loc{$locus}=$taxid; } close(IN); print "there are\t".scalar(keys %tax2loc)."\tlocus_ids as key in hash\n"; ########### subroutine get host and taxid ### sub getDetails() { my ($proph,$lu)=@_; my $host;my $taxid;my $genNum_proph; # print "\t$proph\tproph inside sub"; my @proph_columns=split(/\_/,$proph); # get host info if ($proph_columns[0] =~ /^FP92/){ $host=$proph_columns[0]; } #elsif ($proph_columns[0] =~ /^NZ/){ # $host=join("_",$proph_columns[0],$proph_columns[1]); # $host=substr $host, 0, 7; #} else{ $host=join("_",$proph_columns[0],$proph_columns[1]); } # print "\t$host\thost from inside sub"; #get taxid my @matching_keys= grep { $_=~ /^$host/ } keys %$lu; my $matching_key=$matching_keys[0]; if (exists $lu->{$matching_key}){ *** $taxid=$lu->{$matching_key}; # print "\t$taxid\ttaxid from inside sub\t"; } elsif (!exists $lu->{$matching_key}){ $taxid="undef"; } ## get number of proteins in the prophage if (exists $genesPerProphGen{$proph}){ $genNum_proph=$genesPerProphGen{$proph}; # print "\t$genNum_proph\tfrom inside sub\n" } else{ print "$proph doesn't have number of genes\n" } return ($host,$taxid,$genNum_proph); } ##### Now make table with numbers ##### my $shared_table="$mci_file.JaccHost.tabv2"; my $genNum_prophA;my $genNum_prophB;my $host_prophA;my $host_prophB; open (OUTs,">$shared_table"); print OUTs "# This table was generated using $0 on\t".scalar(localtime(time))."\n"; print OUTs "#".join("\t",qw(Proph_genomeA Proph_genomeB hostA taxidA hostB taxidB Jacc Prots_Shared Protsin_prophA Protsin_prophB))."\n"; foreach my $prophA (keys %$overlap){ print "$prophA\tfrom overlap\t"; my($hostA,$taxidA,$genNum_prophA)=&getDetails($prophA,\%tax2loc); # print "from sub\t $hostA,$taxidA,$genNum_prophA\n"; foreach my $prophB (keys %{$overlap->{$prophA}}){ my($hostB,$taxidB,$genNum_prophB)=&getDetails($prophB,\%tax2loc); # print "from sub\t $hostB,$taxidB,$genNum_prophB\n"; my $A_notShared=$genNum_prophA-$overlap->{$prophA}{$prophB}; my $B_notShared=$genNum_prophB-$overlap->{$prophA}{$prophB}; my $Jac_denom=$overlap->{$prophA}{$prophB}+$A_notShared+$B_notShared; my $Jac=$overlap->{$prophA}{$prophB}/$Jac_denom; print OUTs join("\t",$prophA,$prophB,$hostA,$taxidA,$hostB,$taxidB,$Jac,$overlap->{$prophA}{$prophB},$genNum_prophA,$genNum_prophB)."\n"; } } close(OUTs);