in reply to split a file into records and process it
#! /usr/bin/perl

use 5.010;    # named captures, %+ and the // operator need perl 5.10+
use strict;
use warnings;
use Data::Dumper;

# Reads the report stored in the DATA section, splits it into records
# (a record starts at a line that holds nothing but digits) and parses
# every record into a hash of named attributes.  The result, an array
# of hash references, is dumped to STDOUT.

# Print hash keys in sorted order so the dump is stable between runs.
$Data::Dumper::Sortkeys = 1;

# May an attribute appear more than once in a record?
# 1 => stored as a plain scalar, 0 => collected into an array ref.
# Sorry this is not DRY, the attribute names are duplicated
# here and in the parser regex.
my %is_single = (
    'exon'          => 1,
    'gene_id'       => 1,
    'product_id'    => 1,
    'sno_rna'       => 1,
    'query_subject' => 0,
    'gene_name'     => 1,
    'link'          => 1,
    'other'         => 0,
);

# One alternative per attribute; exactly one named group is set per match.
# Compiled once here because it does not depend on the record being parsed.
# Trailing blanks are stripped from the input below, so no branch needs a
# '\s*' in front of '$'.
my $re = qr{
    (?:  ^ (?<exon> \d+ ) $ )
    |
    (?:  ^ GI:\s* (?<gene_id> \d+ ) $ )
    |
    # The version after the dot may have more than one digit.
    (?:  ^ NM_ (?<product_id> \d+\.\d+ ) $ )
    |
    (?:  ^ snoRNA\s+ (?<sno_rna> .+ ) $ )
    |
    # /s here so that '.' crosses the lines between Query and Sbjct.
    (?s: ^ (?<query_subject> Query .*? Sbjct .*? ) $ )
    |
    (?i: ^ (?<gene_name> Homo \s sapiens .* ) $ )
    |
    (?:  ^ (?<link> https?://.* ) $ )
    |
    (?:  ^ (?<other> .+ ) $ )
    # Order of branches matters, leave (?<other>) at the very end.
}mx;

# Slurp the whole DATA section; fall back to '' if it is empty.
my $text = do { local $/; <DATA> } // '';

# Drop trailing blanks (and a CR from CRLF input) on every line.  Without
# this the split below misses a digits-only line that ends in spaces, and
# the greedy '.+' / '.*' captures keep the trailing blanks in their value.
$text =~ s/[^\S\n]+$//mg;

# You can use split, just match the null string
# before the real match in a look-ahead.
my @records = split /^(?=\d+$)/m, $text;

# An array of hash of something, one item / record.
my @parsed_records;
#my %sno_records;

for my $record_text (@records) {
    my %record;

    while ( $record_text =~ m/$re/g ) {

        # %+ holds only the named group that took part in this match.
        my ($key) = keys %+;
        my $val = $+{$key};

        # If a key can appear only once then simply store it.
        if ( $is_single{$key} ) {
            $record{$key} = $val;
        }
        # Else put it into an array.
        else {
            push @{ $record{$key} }, $val;
        }
    }

    # This @parsed_records is _not_ keyed by sno_rna, as it
    # seemed unnatural for me with the provided sample data.
    push @parsed_records, \%record;

    # But you can easily transform it to a data structure keyed by sno_rna
    # just uncomment the lines related to %sno_records.
    #push @{ $sno_records{ $record{sno_rna} } }, \%record;
    #delete $record{sno_rna};
}

print Dumper( \@parsed_records );
#print Dumper( \%sno_records );

__DATA__
3 GI:91982771 NM_001040105.1 snoRNA 10 Query 4 TGGAGTCAAT 13 |||||||||| Sbjct 4854 TGGAGTCAAT 4845 Homo sapiens mucin 17, cell surface associated (MUC17), mRNA. http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&do +pt=GenBank&RID=UDU305DZ01N&log%24=nuclalign&blast_rank=97&list_uids=9 +1982771 3 GI:154448895 NM_001100162.1 snoRNA 25, 26 and 27 Query 2 CCTGGAGTCGAGTG 15 |||||||||||||| Sbjct 146 CCTGGAGTCGAGTG 133 Homo sapiens exportin 7 (XPO7), transcript variant 3, mRNA. http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&do +pt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=2&list_uids=15 +4448895 31 4 different hits GI:153945877 NM_002458.1 snoRNA 25, 26 and 27 Query 3 CTGGAGTCGAGTG 15 ||||||||||||| Sbjct 6818 CTGGAGTCGAGTG 6806 Query 3 CTGGAGTCGAGTG 15 ||||||||||||| Sbjct 8489 CTGGAGTCGAGTG 8477 Query 3 CTGGAGTCGAGTG 15 ||||||||||||| Sbjct 10589 CTGGAGTCGAGTG 10577 Query 3 CTGGAGTCGAGTG 15 ||||||||||||| Sbjct 12260 CTGGAGTCGAGTG 12248 Homo sapiens mucin 5B, oligomeric mucus/gel-forming (MUC5B), mRNA. http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&do +pt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=9&list_uids=15 +3945877 4 GI:150418008 NM_206862.2 snoRNA 25, 26 and 27 Query 1 ACCTGGAGTCGAG 13 ||||||||||||| Sbjct 4775 ACCTGGAGTCGAG 4763 Homo sapiens transforming, acidic coiled-coil containing protein 2 (TA +CC2), transcript variant 1, mRNA. http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=nucleotide&do +pt=GenBank&RID=UDW41RSS01S&log%24=nuclalign&blast_rank=10&list_uids=1 +50418008
|
|---|