#!/usr/bin/perl
#
# Annotate a pairwise prophage overlap table ("sharedTab") with host and
# taxid information.
#
# Usage: <this script> <sharedTab_file> <tax2locus_file>
#
#   sharedTab_file  tab-separated table; column 1 and column 2 hold the two
#                   prophage ids and column 6 holds the value written out
#                   as "jacc". Lines starting with '#' are ignored.
#   tax2locus_file  tab-separated table of "taxid<TAB>locus_id" lines.
#
# The result is written to "<sharedTab_file>.hostinfo".
use strict;
use warnings;

# Only run the pipeline when executed as a script, so that getTaxId() can be
# loaded and exercised on its own without touching any files.
main(@ARGV) unless caller;

# Entry point: load the locus->taxid table, then rewrite every data row of
# the sharedTab file with host/taxid columns added.
sub main {
    my ($sharedTab_file, $tax2locus_file) = @_;

    die "Usage: $0 <sharedTab_file> <tax2locus_file>\n"
        unless defined $sharedTab_file && defined $tax2locus_file;

    my $tax2loc = readTax2Locus($tax2locus_file);
    print "there are\t" . scalar(keys %$tax2loc) . "\tlocus_ids as key in hash\n";

    ############### Now read in sharedTab file with pairwise overlap info
    my $outfile = "$sharedTab_file.hostinfo";

    # Open the input first so a missing input does not leave an empty
    # output file behind.
    open my $in,  '<', $sharedTab_file or die "Can't open $sharedTab_file: $!\n";
    open my $out, '>', $outfile        or die "Can't create $outfile: $!\n";

    print {$out} "#prophageA\tprophageB\thostA\ttaxidA\thostB\ttaxidB\tjacc\n";

    while (my $line = <$in>) {
        chomp $line;
        next if $line =~ /^#/;        # ignore comments
        next unless $line =~ /\S/;    # ignore blank lines

        my @columns = split /\t/, $line;
        die "Malformed line $. in $sharedTab_file: expected at least 6 columns\n"
            if @columns < 6;

        my ($prophageA, $hostA, $taxidA) = getTaxId($columns[0], $tax2loc);
        my ($prophageB, $hostB, $taxidB) = getTaxId($columns[1], $tax2loc);

        print {$out} join("\t",
            $prophageA, $prophageB,
            $hostA,     $taxidA,
            $hostB,     $taxidB,
            $columns[5]), "\n";
    }

    close $in;
    # Buffered write errors only surface at close time.
    close $out or die "Can't close $outfile: $!\n";

    return;
}

# Read a "taxid<TAB>locus_id" file and return a hash reference mapping
# locus_id => taxid. Lines without both fields are skipped.
sub readTax2Locus {
    my ($file) = @_;

    my %taxid_for;
    open my $in, '<', $file or die "Can't open $file: $!\n";
    while (my $line = <$in>) {
        chomp $line;
        my ($taxid, $locus) = split /\t/, $line;
        next unless defined $taxid && defined $locus;
        $taxid_for{$locus} = $taxid;
    }
    close $in;

    return \%taxid_for;
}

# Resolve the host and taxid of a prophage.
#
#   $prophage  prophage id; the host is the part before the first '.'
#   $lu        hash reference mapping locus_id => taxid
#
# Returns the list ($prophage, $host, $taxid), where $host is the
# (possibly shortened) host accession used for the lookup.
# Dies unless exactly one locus_id in $lu contains the host accession.
sub getTaxId {
    my ($prophage, $lu) = @_;

    my ($host) = split /\./, $prophage;

    ## for wgs genomes just match first 7 characters as only NZ_XXXX000000 are
    ## in tax2locus
    $host =~ s/^(NZ.{5}).*/$1/;

    # \Q...\E: the accession is data, not a pattern.
    my @matches = grep { /\Q$host\E/ } keys %$lu;

    die "Expected exactly one match for $host. Got " . scalar(@matches) . "\n"
        unless @matches == 1;

    # The matched key is the locus id; the taxid is the value stored under it.
    return ($prophage, $host, $lu->{ $matches[0] });
}