use strict;
use warnings;
use Data::Dumper;

# Rewrite the headers of a FASTA file so that each record is named by its
# GI number or, failing that, its protein_id.
#
# Usage: perl <this script> <input.fasta>
#
# Outputs (both created next to the input file):
#   <input>.gi_header      - records whose header carries GI="..." get the
#                            header ">gi<GI>"; otherwise records whose header
#                            carries protein_id="..." get ">protein_id value".
#                            Sequence lines are copied through unchanged.
#   <input>.headers_wNOgi  - one line per header that has neither attribute.
#
# A three-line summary (headers, GIs, protein ids seen) is printed to STDOUT.

my $in_fasta = $ARGV[0];
defined $in_fasta
    or die "usage: $0 <input.fasta>\n";

# Three-argument open with lexical handles: the file name can no longer be
# interpreted as a mode or a pipe, and every open is checked.
open(my $in_fh, '<', $in_fasta)
    or die "cannot open $in_fasta: $!";

my $out_fasta = "$in_fasta.gi_header";
open(my $out_fh, '>', $out_fasta)
    or die "cannot open $out_fasta for writing: $!";

my $err_file = "$in_fasta.headers_wNOgi";
open(my $err_fh, '>', $err_file)
    or die "cannot open $err_file for writing: $!";

my $header_count = 0;
my $gi_count     = 0;
my $protid_count = 0;

# Maps the identifier written to the output (GI or protein_id) back to the
# first whitespace-delimited token of the original header (without '>').
# It is only populated in the code visible here; kept in case later code
# reads it.
my %giTogeneid;

# True while we are inside a record whose header was rejected. Its sequence
# lines must not be written to the output, otherwise they would be glued
# onto the previous record (or appear before any header at all).
my $skip_record = 0;

while (my $line = <$in_fh>) {

    # Sequence (or any other non-header) line.
    if ($line !~ /^>/) {
        print {$out_fh} $line unless $skip_record;
        next;
    }

    $header_count++;

    # GI takes precedence over protein_id when a header has both.
    my $record_id;
    if ($line =~ /^>.+GI="(\w+)"/) {
        $record_id = $1;
        $gi_count++;
        print {$out_fh} ">gi$record_id\n";
    }
    elsif ($line =~ /^>.+protein_id="(\w+)"/) {
        $record_id = $1;
        $protid_count++;
        print {$out_fh} ">$record_id\n";
    }
    else {
        # No usable identifier: log the header exactly once (the line's own
        # newline is stripped first so no blank line follows it) and drop
        # the record's sequence from the output.
        (my $header = $line) =~ s/\n\z//;
        print {$err_fh} "$header\n";
        $skip_record = 1;
        next;
    }

    $skip_record = 0;

    # Original gene id = first column of the header, minus the leading '>'.
    my ($SM_geneid) = split(/\s+/, $line);
    $SM_geneid =~ s/^>//;
    $giTogeneid{$record_id} = $SM_geneid;
}

close($in_fh);

print "number of headers seen\t$header_count\n";
print "number of gis seen\t$gi_count\n";
print "number of protids seen\t$protid_count\n";

# Buffered write errors (e.g. disk full) only surface at close time.
close($out_fh) or die "error closing $out_fasta: $!";
close($err_fh) or die "error closing $err_file: $!";