Then I'd suggest that, rather than having a single loop that reads lines and then uses conditionals to decide what to do with each type of line, you use a loop that terminates on eof and reads the individual lines of each record inline. This makes for a more robust parser, with less confusing conditional code and less line-to-line state.
This doesn't do the final extraction of the required parts from the individual lines of the records, which is easily added, but it serves to demonstrate the technique:
#! perl -slw
use strict;
use Data::Dump qw[ pp ];

# Parse the DATA section into %records.
#
# Each record is read inline, one field per line, in this order:
#   1. the record key (held in $exon)
#   2. optionally, a line matching "N different hits" giving the number of
#      query/subject pairs that follow (defaults to 1 when absent)
#   3. gene_id
#   4. Nm_id
#   5. snoRNA_key
#   6. N groups of three lines: query, a line that is discarded, subject
#   7. gene_name
#   8. web_link
#
# %records maps each key to an array of record hashes, so several records
# sharing the same key are all retained.
my %records;

# Return the next line of DATA with its line ending removed.
# Dies if the data ends in the middle of a record; $what names the field
# being read so the message says where the input was cut short.
sub next_line {
    my( $what ) = @_;
    my $line = <DATA>;
    die "Unexpected end of data while reading $what\n"
        unless defined $line;
    chomp $line;
    return $line;
}

until( eof( DATA ) ) {
    my $exon = next_line( 'record key' );
    my %record;

    # The line after the key is either the hit-count line or, when there
    # is only a single hit, the gene_id itself.
    my $seqs = 1;
    my $line = next_line( "hit count or gene_id of record '$exon'" );
    if( $line =~ m[(\d+) different hits] ) {
        $seqs = $1;
        $line = next_line( "gene_id of record '$exon'" );
    }
    $record{ gene_id }    = $line;
    $record{ Nm_id }      = next_line( "Nm_id of record '$exon'" );
    $record{ snoRNA_key } = next_line( "snoRNA_key of record '$exon'" );

    for( 1 .. $seqs ) {
        my $query = next_line( "query line of record '$exon'" );
        # The line between query and subject is read and thrown away.
        next_line( "separator line of record '$exon'" );
        my $sbjct = next_line( "subject line of record '$exon'" );
        push @{ $record{ seqs } }, { $query => $sbjct };
    }

    $record{ gene_name } = next_line( "gene_name of record '$exon'" );
    $record{ web_link }  = next_line( "web_link of record '$exon'" );

    push @{ $records{ $exon } }, \%record;
}

# pp is called in void context, so Data::Dump prints the dump itself
# (to STDERR, per the Data::Dump documentation).
pp \%records;
__DATA__
3
GI:91982771
NM_001040105.1
snoRNA 10
Query 4 TGGAGTCAAT 13
||||||||||
Sbjct 4854 TGGAGTCAAT 4845
Homo sapiens mucin 17, cell surface associated (MUC17), mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDU305DZ01N&log%24=nuclalign&blast_rank=97&list_uids=91982771
3
GI:154448895
NM_001100162.1
snoRNA 25, 26 and 27
Query 2 CCTGGAGTCGAGTG 15
||||||||||||||
Sbjct 146 CCTGGAGTCGAGTG 133
Homo sapiens exportin 7 (XPO7), transcript variant 3, mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=2&list_uids=154448895
31
4 different hits
GI:153945877
NM_002458.1
snoRNA 25, 26 and 27
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 6818 CTGGAGTCGAGTG 6806
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 8489 CTGGAGTCGAGTG 8477
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 10589 CTGGAGTCGAGTG 10577
Query 3 CTGGAGTCGAGTG 15
|||||||||||||
Sbjct 12260 CTGGAGTCGAGTG 12248
Homo sapiens mucin 5B, oligomeric mucus/gel-forming (MUC5B), mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=9&list_uids=153945877
4
GI:150418008
NM_206862.2
snoRNA 25, 26 and 27
Query 1 ACCTGGAGTCGAG 13
|||||||||||||
Sbjct 4775 ACCTGGAGTCGAG 4763
Homo sapiens transforming, acidic coiled-coil containing protein 2 (TACC2), transcript variant 1, mRNA.
http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=10&list_uids=150418008
Output:
C:\test>junk55
{
3 => [
{
Nm_id => "NM_001040105.1",
gene_id => "GI:91982771",
gene_name => "Homo sapiens mucin 17, cell surface associa
+ted (MUC17), mRNA.",
seqs => [
{
"Query 4 TGGAGTCAAT 13" => "Sbjct
+ 4854 TGGAGTCAAT 4845",
},
],
snoRNA_key => "snoRNA 10",
web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cm
+d=Retrieve&db=nucleotide&dopt=GenBank&RID=UDU305DZ01N&log%24=nuclalig
+n&blast_rank=97&list_uids=91982771",
},
{
Nm_id => "NM_001100162.1",
gene_id => "GI:154448895",
gene_name => "Homo sapiens exportin 7 (XPO7), transcript
+variant 3, mRNA.",
seqs => [
{
"Query 2 CCTGGAGTCGAGTG 15" => "Sbj
+ct 146 CCTGGAGTCGAGTG 133",
},
],
snoRNA_key => "snoRNA 25, 26 and 27",
web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cm
+d=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalig
+n&blast_rank=2&list_uids=154448895",
},
],
4 => [
{
Nm_id => "NM_206862.2",
gene_id => "GI:150418008",
gene_name => "Homo sapiens transforming, acidic coiled-co
+il containing protein 2 (TACC2), transcript variant 1, mRNA.",
seqs => [
{
"Query 1 ACCTGGAGTCGAG 13" => "Sbj
+ct 4775 ACCTGGAGTCGAG 4763",
},
],
snoRNA_key => "snoRNA 25, 26 and 27",
web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cm
+d=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalig
+n&blast_rank=10&list_uids=150418008",
},
],
31 => [
{
Nm_id => "NM_002458.1",
gene_id => "GI:153945877",
gene_name => "Homo sapiens mucin 5B, oligomeric mucus/gel
+-forming (MUC5B), mRNA.",
seqs => [
{
"Query 3 CTGGAGTCGAGTG 15" => "Sbj
+ct 6818 CTGGAGTCGAGTG 6806",
},
{
"Query 3 CTGGAGTCGAGTG 15" => "Sbj
+ct 8489 CTGGAGTCGAGTG 8477",
},
{
"Query 3 CTGGAGTCGAGTG 15" => "Sb
+jct 10589 CTGGAGTCGAGTG 10577",
},
{
"Query 3 CTGGAGTCGAGTG 15" => "Sb
+jct 12260 CTGGAGTCGAGTG 12248",
},
],
snoRNA_key => "snoRNA 25, 26 and 27",
web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cm
+d=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalig
+n&blast_rank=9&list_uids=153945877",
},
],
}
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
|