use strict;
use warnings;

# Parse GenBank-style flat-file text into records and print every record's
# fields in sorted key order.
#
# A line is recognised when it consists of a 10-character field column, one
# space, and the field text.  A blank field column marks a continuation of
# the field currently being collected.  Lines that do not match that shape
# (for example the "//" terminator) close the field being collected.
#
# NOTE(review): the embedded sample below was re-wrapped onto separate lines
# using GenBank flat-file columns; confirm against the original data file.

# parse_records($fh)
#
# Reads lines from the filehandle $fh until EOF and returns a list of hash
# references, one per record.  Each hash maps a field name (the field column
# with all spaces removed) to an array reference holding the text lines
# collected for that field.  A new record is started whenever a 'LOCUS'
# field is stored.  Within one record a later field with the same name
# replaces the earlier one.
sub parse_records {
    my ($fh) = @_;

    my @records;
    my $currField;    # name of the field being collected, undef when none
    my $currTail;     # array ref of text lines collected for $currField

    # The "or defined $currField" part grants one extra pass after EOF so
    # the field still being collected is stored before the loop ends.
    while (defined(my $line = <$fh>) or defined $currField) {
        my ($field, $tail);
        ($field, $tail) = $line =~ /^(.{10}) (.*)/ if defined $line;

        # Nothing recognisable on this line and nothing pending: skip it.
        next if !defined $tail && !defined $currField;

        # "  ORGANISM" -> "ORGANISM"; an all-blank column becomes "".
        $field =~ tr/ //d if defined $field;
        $currField //= $field;

        # Store the pending field when the input ended, the line was not a
        # field line, or a differently named field starts here.
        if (!defined $field or (length $field && $currField ne $field)) {
            # Also create a record when none exists yet, so that a field
            # appearing before any LOCUS cannot index into an empty array.
            push @records, {} if $currField eq 'LOCUS' || !@records;
            $records[-1]{$currField} = $currTail;
            $currField = undef;
            $currTail  = undef;

            # Stop only at EOF.  A non-matching line such as "//" merely
            # closes the field; further records may follow it.
            last if !defined $line;
        }

        # $field is undef after a non-matching line, hence the defined test.
        $currField = $field if defined $field && length $field;
        push @$currTail, $tail if defined $tail;
    }

    return @records;
}

# print_records(@records)
#
# Prints, for every record and every field name in sorted order, a
# "NAME:" header line followed by the field's text lines, each prefixed
# with one space.
sub print_records {
    my (@records) = @_;

    for my $record (@records) {
        for my $name (sort keys %$record) {
            print "$name:\n", map { " $_\n" } @{ $record->{$name} };
        }
    }

    return;
}

my @records = parse_records(\*DATA);
print_records(@records);

__DATA__
LOCUS       NM_001098209 3415 bp mRNA linear PRI 27-APR-2014
DEFINITION  Homo sapiens catenin (cadherin-associated protein), beta 1, 88kDa
            (CTNNB1), transcript variant 2, mRNA.
ACCESSION   NM_001098209 XM_001133660 XM_001133664 XM_001133673 XM_001133675
VERSION     NM_001098209.1 GI:148233337
KEYWORDS    RefSeq.
SOURCE      Homo sapiens (human)
  ORGANISM  Homo sapiens
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
            Catarrhini; Hominidae; Homo.
     CDS             269..2614
                     /gene="CTNNB1"
                     /gene_synonym="armadillo; CTNNB; MRD19"
                     /codon_start=1
                     /product="catenin beta-1"
                     /protein_id="NP_001091679.1"
                     /db_xref="GI:148233338"
                     /db_xref="CCDS:CCDS2694.1"
                     /db_xref="GeneID:1499"
                     /db_xref="HGNC:HGNC:2514"
                     /db_xref="MIM:116806"
                     /translation="MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAP
                     SLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVADIDGQYAMTRAQRVRAAMFPET
                     LDEGMQIPSTQFDAAHPTNVQRLAEPSQMLKHAVVNLINYQDDAELATRAIPELTKLL
//