Thanks for the comments. I have significantly reworked the code, and most of it is working correctly. I can read in the blast results table, correctly assign "Flags" to each row of results. Find the maximum $bitscore for each query. But in the last section, I want to tabulate the number of different "Flags" for each Query, only for those results with $bit_scores within 90% of the top bit score. However, there is an error -> see below. Any help appreciated.
############### read in blast table and parse #######
my $in_blast_tab=$ARGV[0];
open(IN,$in_blast_tab) or die "cannot open $in_blast_tab\n";
my $HoFlg1={}; my $HoFlg2={};
#my $Flag1; my $Flag2;
my %maxBits; my $max_bits=60;
while (my $line=<IN>) {
chomp $line;
my ($Query_id,$strand,$Subj_id,$Perc_iden,$align_len,$num_mm,$
+gap,$q_start,$q_end,$s_start,$s_end,$e_value,$bit_score)=split("\t",$
+line); # extra f
ield of strand
next if ($bit_score <60);
if ($bit_score > $max_bits){
$max_bits=$bit_score;
}
$maxBits{$Query_id}=$max_bits;
my ($Flag1, $Flag2 ) = &Flag( $Subj_id, \%proph_prots, \%euk_p
+rots, \%vir_prots );
$HoFlg1->{$Query_id}->{$Flag1}->{$bit_score}++;
$HoFlg2->{$Query_id}->{$Flag2}->{$bit_score}++;
print join("\t",$Query_id,$Subj_id,$bit_score,$Flag1,$Flag2)."
+\n";
}
## Get a Flag for each query/subject
sub Flag {
my ( $Subj_id, $proph_prots, $euk_prots, $vir_prots ) = @_;
return "Proph", "Phage" if exists $$proph_prots{$Subj_id};
return "Euk", "Euk" if exists $$euk_prots{$Subj_id};
return "Vir", "Phage" if exists $$vir_prots{$Subj_id};
return "Bact", "Bact";
} ## end sub Flag
## now for all query/flag pairs with bit scores that are within 90% of
+ the top_bit for that query
##
my @flag_list2=("Phage","Euk","Bact");
my $count={};
foreach my $q (keys %{$HoFlg2}){
print "$q\t";
for my $flag (@flag_list2){
for my $b (keys %{$HoFlg2->{$flag}}){
if ($b > 0.9*$maxBits{$q}){
$count->{$flag} += $HoFlg2->{$q}->{$fl
+ag}->{$b};
}
}
print "$count\t";
}
print "\n";
}
sampe input
158256496-stool1_revised_C972998_1_gene3 strand:- 581103
+.GY4MC1_2020 37.93 116 61 2 1 110 1
+ 111 1e-
158256496-stool1_revised_C972998_1_gene3 strand:- 539329
+.FTPG_01302 36.28 113 71 1 1 113 1
+ 112 1e-
158256496-stool1_revised_C972998_1_gene3 strand:- 634956
+.Geoth_2108 37.93 116 61 2 1 110 1
+ 111 1e-
V1.UC9-0_revised_scaffold1508_3_gene71913 strand:+ 565641
+.EFOG_02198 57.55 212 55 3 1 212 967
+ 1143 9e-
V1.UC9-0_revised_scaffold1508_3_gene71913 strand:+ 565640
+.EFMG_02145 57.55 212 55 3 1 212 967
+ 1143 9e-
V1.UC9-0_revised_scaffold1508_3_gene71913 strand:+ 565639
+.EFYG_01514 57.55 212 55 3 1 212 967
+ 1143 9e-
V1.UC9-0_revised_scaffold1508_3_gene71913 strand:+ 565638
+.EFCG_01989 57.55 212 55 3 1 212 967
+ 1143 9e-
curent output
158256496-stool1_revised_C972998_1_gene3 HASH(0x2b3c2bd3b000) HA
+SH(0x2b3c2bd3b000) HASH(0x2b3c2bd3b000)
V1.UC9-0_revised_scaffold1508_3_gene71913 HASH(0x2b3c2bd3b000) H
+ASH(0x2b3c2bd3b000) HASH(0x2b3c2bd3b000)
desired output
#query num_Phage num_Euk num_Bact
query_1 5 1 0
query_2 0 6 1
|