in reply to split a file into records and process it
A more complete parser, now that we've established the data format:
(Updated: simplified the references, by using a local %record during parsing. Same output)
(Update2: Removed need for duplicated Gene_id regex.)
(Update3: Corrected dumb context assignment error introduced during simplification.)
#! perl -slw
use strict;
use warnings;
use Data::Dump qw[ pp ];

# Parse the records that follow __DATA__ into %records and dump the result.
#
# Line layout of one record, as consumed by the loop below:
#   1. exon identifier (used verbatim as the key of %records)
#   2. optional "<N> different hits" line; N is the number of alignments in
#      this record (1 when the line is absent)
#   3. a line containing "GI:<digits>"              -> gene_id
#   4. a line containing "NM_<digits>"              -> Nm_id
#   5. a line whose digit runs are all collected    -> snoRNA_key
#   6. N alignments of three lines each: query line, one ignored line,
#      subject line
#   7. gene name
#   8. web link
#
# Result: %records maps exon => [ record, ... ]; each record is a hash with
# the keys gene_id, Nm_id, snoRNA_key, seqs, gene_name and web_link.

my %records;

until ( eof( DATA ) ) {
    my %record;

    chomp( my $exon = <DATA> );

    # A blank line cannot start a record (e.g. a trailing newline after the
    # last record); skipping it avoids storing an empty record under ''.
    next unless $exon =~ m[\S];

    # The hit-count line is optional; without it the record has one alignment.
    my $seqs = 1;
    my $line = <DATA>;
    if ( $line =~ m[(\d+) different hits] ) {
        $seqs = $1;
        $line = <DATA>;
    }

    # List assignment puts the match in list context, so the captured text
    # (not the match count) is what gets stored.
    ( $record{ gene_id } ) = ( $line   =~ m[GI:(\d+)] );
    ( $record{ Nm_id }   ) = ( <DATA>  =~ m[(NM_\d[\d]+)] );

    # Every run of digits on this line becomes one snoRNA key.
    push @{ $record{ snoRNA_key } }, ( <DATA> =~ m[(\d+)]g );

    for ( 1 .. $seqs ) {
        # Split on whitespace and drop the first field
        # (presumably the "Query" label -- confirm against the input).
        my $query = [ split ' ', <DATA> ];
        shift @$query;

        # Read and discard the line between the query and subject lines.
        scalar( <DATA> );

        # Same treatment for the subject line (presumably a "Sbjct" label).
        my $sbjct = [ split ' ', <DATA> ];
        shift @$sbjct;

        push @{ $record{ seqs } }, { query => $query, sbjct => $sbjct };
    }

    chomp( $record{ gene_name } = <DATA> );
    chomp( $record{ web_link }  = <DATA> );

    # Several records may share the same exon, hence the array per key.
    push @{ $records{ $exon } }, \%record;
}

# pp in void context prints the dump itself (to STDERR, per Data::Dump's docs).
pp \%records;

# Input records go after the marker below (not included in this listing).
__DATA__
Output:
C:\test>junk55
{
  3  => [
          {
            Nm_id => "NM_001040105",
            gene_id => 91982771,
            gene_name => "Homo sapiens mucin 17, cell surface associated (MUC17), mRNA.",
            seqs => [
              { query => [4, "TGGAGTCAAT", 13], sbjct => [4854, "TGGAGTCAAT", 4845] },
            ],
            snoRNA_key => [10],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDU305DZ01N&log%24=nuclalign&blast_rank=97&list_uids=91982771",
          },
          {
            Nm_id => "NM_001100162",
            gene_id => 154448895,
            gene_name => "Homo sapiens exportin 7 (XPO7), transcript variant 3, mRNA.",
            seqs => [
              {
                query => [2, "CCTGGAGTCGAGTG", 15],
                sbjct => [146, "CCTGGAGTCGAGTG", 133],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=2&list_uids=154448895",
          },
        ],
  4  => [
          {
            Nm_id => "NM_206862",
            gene_id => 150418008,
            gene_name => "Homo sapiens transforming, acidic coiled-coil containing protein 2 (TACC2), transcript variant 1, mRNA.",
            seqs => [
              {
                query => [1, "ACCTGGAGTCGAG", 13],
                sbjct => [4775, "ACCTGGAGTCGAG", 4763],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=10&list_uids=150418008",
          },
        ],
  31 => [
          {
            Nm_id => "NM_002458",
            gene_id => 153945877,
            gene_name => "Homo sapiens mucin 5B, oligomeric mucus/gel-forming (MUC5B), mRNA.",
            seqs => [
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [6818, "CTGGAGTCGAGTG", 6806],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [8489, "CTGGAGTCGAGTG", 8477],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [10589, "CTGGAGTCGAGTG", 10577],
              },
              {
                query => [3, "CTGGAGTCGAGTG", 15],
                sbjct => [12260, "CTGGAGTCGAGTG", 12248],
              },
            ],
            snoRNA_key => [25, 26, 27],
            web_link => "http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&dopt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=9&list_uids=153945877",
          },
        ],
}
|
|---|