in reply to split a file into records and process it

A more complete parser, now that we've established the data format:

(Updated: simplified the references, by using a local %record during parsing. Same output)

(Update2: Removed need for duplicated Gene_id regex.)

(Update3: Corrected dumb context assignment error introduced during simplification.)

#! perl -slw
use strict;
use Data::Dump qw[ pp ];

# Parse the records held in the DATA section into %records and dump the
# resulting structure with pp.
#
# %records maps an exon identifier to an array of record hashes; several
# records may share one exon. Each record is read as a fixed sequence of
# lines:
#
#   1. the exon identifier (used verbatim as the key in %records)
#   2. optionally, a line containing "<N> different hits"; N query/sbjct
#      pairs then follow instead of the default of one
#   3. a line containing "GI:<digits>"           -> gene_id
#   4. a line containing "NM_<digits>"           -> Nm_id
#   5. a line whose digit runs are all collected -> snoRNA_key
#   6. N groups of three lines: query, one ignored line, sbjct
#   7. the gene name                             -> gene_name
#   8. the web link                              -> web_link

my %records;

# Return the next line of DATA. Dies (with script and DATA line numbers,
# since the message has no trailing newline) when the data ends part-way
# through a record, instead of silently storing undef fields.
sub next_line {
    my( $expected ) = @_;
    my $line = <DATA>;
    die "Unexpected end of DATA; expected $expected" unless defined $line;
    return $line;
}

RECORD:
while( defined( my $exon = <DATA> ) ) {
    chomp $exon;

    # Tolerate blank lines between records (e.g. a trailing empty line),
    # which would otherwise start a bogus, empty record.
    next RECORD unless $exon =~ m[\S];

    my %record;

    # One query/sbjct pair unless an explicit hit count line is present.
    my $seqs = 1;
    my $line = next_line( 'hit count or GI line' );
    if( $line =~ m[(\d+) different hits] ) {
        $seqs = $1;
        $line = next_line( 'GI line' );
    }

    # List assignment keeps the captured text (or undef when nothing matched)
    # rather than the match's success flag.
    ( $record{ gene_id } ) = ( $line =~ m[GI:(\d+)] );
    ( $record{ Nm_id }   ) = ( next_line( 'NM_ accession line' ) =~ m[(NM_\d+)] );

    # Every run of digits on the line becomes one snoRNA key.
    push @{ $record{ snoRNA_key } },
        ( next_line( 'snoRNA key line' ) =~ m[(\d+)]g );

    for my $n ( 1 .. $seqs ) {
        # Split on whitespace and drop the first field (presumably the
        # "Query"/"Sbjct" label -- the input format is not shown here).
        my $query = [ split ' ', next_line( "query line $n" ) ];
        shift @$query;

        # The line between query and sbjct is read and discarded.
        next_line( "line between query and sbjct $n" );

        my $sbjct = [ split ' ', next_line( "sbjct line $n" ) ];
        shift @$sbjct;

        push @{ $record{ seqs } }, { query => $query, sbjct => $sbjct };
    }

    chomp( $record{ gene_name } = next_line( 'gene name line' ) );
    chomp( $record{ web_link }  = next_line( 'web link line' ) );

    push @{ $records{ $exon } }, \%record;
}

pp \%records;

__DATA__

Output:

C:\test>junk55
{
  3  => [
          {
            Nm_id => "NM_001040105",
            gene_id => 91982771,
            gene_name => "Homo sapiens mucin 17, cell surface associated (MUC17), mRNA.",
            seqs => [
              { query => [4, "TGGAGTCAAT", 13], sbjct => [4854, "TGGAGTCAAT", 4845] },
            ],
            snoRNA_key => [10],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDU305DZ01N&log%24=nuclalign&blast_rank=97&list_uids=91982771",
          },
          {
            Nm_id => "NM_001100162",
            gene_id => 154448895,
            gene_name => "Homo sapiens exportin 7 (XPO7), transcript variant 3, mRNA.",
            seqs => [
              {
                query => [2, "CCTGGAGTCGAGTG", 15],
                sbjct => [146, "CCTGGAGTCGAGTG", 133],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=2&list_uids=154448895",
          },
        ],
  4  => [
          {
            Nm_id => "NM_206862",
            gene_id => 150418008,
            gene_name => "Homo sapiens transforming, acidic coiled-coil containing protein 2 (TACC2), transcript variant 1, mRNA.",
            seqs => [
              {
                query => [1, "ACCTGGAGTCGAG", 13],
                sbjct => [4775, "ACCTGGAGTCGAG", 4763],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=10&list_uids=150418008",
          },
        ],
  31 => [
          {
            Nm_id => "NM_002458",
            gene_id => 153945877,
            gene_name => "Homo sapiens mucin 5B, oligomeric mucus/gel-forming (MUC5B), mRNA.",
            seqs => [
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [6818, "CTGGAGTCGAGTG", 6806],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [8489, "CTGGAGTCGAGTG", 8477],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [10589, "CTGGAGTCGAGTG", 10577],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [12260, "CTGGAGTCGAGTG", 12248],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=9&list_uids=153945877",
          },
        ],
}

Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
"I'd rather go naked than blow up my ass"