#!/usr/bin/perl
use strict;
use warnings;

# Usage: <script> <sharedTab_file> <tax2locus_file>
#
# Reads a tab-separated taxid/locus table, then annotates every pair in the
# sharedTab file with the host and taxid of both prophages. Results are
# written to "<sharedTab_file>.hostinfo".
my ($sharedTab_file, $tax2locus_file) = @ARGV;
die "Usage: $0 <sharedTab_file> <tax2locus_file>\n"
    unless defined $sharedTab_file && defined $tax2locus_file;

# Lookup table: locus id => taxid.
my %tax2loc;

open my $in, '<', $tax2locus_file or die "Can't open $tax2locus_file: $!\n";

while (my $line = <$in>) {
    chomp $line;
    my ($taxid, $locus) = split /\t/, $line;

    # Skip blank or malformed lines instead of storing undef keys/values.
    next unless defined $taxid && defined $locus;
    $tax2loc{$locus} = $taxid;
}
close $in;

print "there are\t" . scalar(keys %tax2loc) . "\tlocus_ids as key in hash\n";

############### Now read in sharedTab file with pairwise overlap info

my $outfile = "$sharedTab_file.hostinfo";

open my $out, '>', $outfile or die "Can't create $outfile: $!\n";
open $in, '<', $sharedTab_file or die "Can't open $sharedTab_file: $!\n";

print $out "#prophageA\tprophageB\thostA\ttaxidA\thostB\ttaxidB\tjacc\n";

while (my $line = <$in>) {
    chomp $line;
    next if $line =~ /^#/;       # ignore comments
    next if $line !~ /\S/;       # ignore blank lines
    my @columns = split /\t/, $line;

    # The lookup hash must be handed to getTaxId explicitly; it unpacks it as
    # its second argument.
    # NOTE(review): assumes column 1 holds the second prophage id of the pair
    # (the output header lists prophageA then prophageB) -- confirm against
    # the sharedTab format.
    my ($prophageA, $hostA, $taxidA) = getTaxId($columns[0], \%tax2loc);
    my ($prophageB, $hostB, $taxidB) = getTaxId($columns[1], \%tax2loc);

    print $out join ("\t",
        $prophageA, $prophageB, $hostA, $taxidA, $hostB, $taxidB, $columns[5]),
        "\n";
}
close $in;

# Buffered write errors only surface at close, so check it.
close $out or die "Can't close $outfile: $!\n";


# getTaxId($prophage, $lu)
#
# Resolve the host and taxid for a prophage identifier.
#
#   $prophage - identifier of the form "<host>.<suffix>"; everything before
#               the first '.' is taken as the host accession.
#   $lu       - hash reference whose keys are locus ids and whose values are
#               the corresponding taxids.
#
# Returns the list ($prophage, $host, $taxid).
# Dies if $lu is not a hash reference, if $prophage is undefined, or unless
# exactly one locus id in $lu contains the host string.
sub getTaxId {
    my ($prophage, $lu) = @_;

    die "getTaxId: prophage id is undefined\n" unless defined $prophage;
    die "getTaxId: expected a hash reference of locus ids as second argument\n"
        unless ref $lu eq 'HASH';

    my ($host) = split /\./, $prophage;

    ## for wgs genomes just match first 7 characters as only NZ_XXXX000000 are
    ## in tax2locus
    $host =~ s/^(NZ.{5}).*/$1/;

    # \Q...\E so the accession is matched literally, never as a pattern.
    my @matches = grep { /\Q$host\E/ } keys %$lu;

    die "Expected exactly one match for $host. Got " . scalar @matches . "\n"
        unless @matches == 1;

    # Hand back the taxid stored under the matched locus, not the locus itself.
    return $prophage, $host, $lu->{ $matches[0] };
}