# Parse a sequence listing made of numeric-tagged lines (<160>, <210>,
# <400>, ...) from the __DATA__ section below, then print every sequence
# together with its organism.
#
# Each "<210> SEQ ID NO n" line opens a new record; the tagged lines that
# follow fill in that record, and every untagged line of word characters and
# whitespace is appended to the record's sequence text. Any line that fits
# none of the known shapes aborts the run.
use strict;
use warnings;

my $seq_num;     # value of the <160> NUMBER OF SEQ ID NOS line
my $appl_num;    # value of the <140> CURRENT APPLICATION NUMBER line
my $cur_seq;     # ID of the record being filled; undef until the first <210>
my %seq;         # sequence ID => { length, type, organism, ..., seq }

# Tagged lines that belong to the current record. Each entry pairs the hash
# key with a pattern whose single capture group is the value to store.
my @record_fields = (
    [ length   => qr/^<211> LENGTH: (\d+)$/ ],
    [ type     => qr/^<212> TYPE: (DNA|PRT)$/ ],
    [ organism => qr/^<213> ORGANISM: (\w+ \w+)$/ ],
    [ feature  => qr/^<220> FEATURE:([\w\s]*)$/ ],
    [ name_key => qr{^<221> NAME/KEY: (\w+)$} ],
    [ location => qr/^<222> LOCATION: ([\d\.\(\)]+)$/ ],
    [ other    => qr/^<223> OTHER INFORMATION: ([^<]+)$/ ],
    [ seq_num  => qr/^<400> SEQUENCE: (\d+)$/ ],
);

LINE:
while ( my $line = <DATA> ) {

    # Strip the line terminator so it cannot leak into captured values
    # (the greedy [\w\s]* and [^<]+ classes would otherwise swallow it).
    chomp $line;

    # Blank lines carry no information; skipping them also prevents a
    # stray blank line ahead of the first <210> from creating a bogus record.
    next LINE if $line !~ /\S/;

    # Listing-wide header lines.
    if ( $line =~ /^<160> NUMBER OF SEQ ID NOS: (\d+)$/ ) {
        $seq_num = $1;
        next LINE;
    }
    if ( $line =~ m{^<140> CURRENT APPLICATION NUMBER: ([\w/,]+)$} ) {
        $appl_num = $1;
        next LINE;
    }

    # Start of a new record.
    if ( $line =~ /^<210> SEQ ID NO (\d+)$/ ) {
        $cur_seq = $1;
        $seq{$cur_seq} = {};
        next LINE;
    }

    # Tagged line belonging to the current record.
    my ( $field, $value );
    FIELD:
    for my $spec (@record_fields) {
        my ( $name, $pattern ) = @{$spec};

        # List-context match returns the capture; an empty capture (as in a
        # bare "<220> FEATURE:" line) still counts as a successful match.
        if ( my ($captured) = $line =~ $pattern ) {
            ( $field, $value ) = ( $name, $captured );
            last FIELD;
        }
    }
    if ( defined $field ) {
        die "Record field before any <210> SEQ ID NO line: $line\n"
            if !defined $cur_seq;
        $seq{$cur_seq}{$field} = $value;
        next LINE;
    }

    # Untagged text: one row of the current record's sequence. The row is
    # stored verbatim (position counters included), one row per line.
    if ( $line =~ /^([\w\s]+)$/ ) {
        die "Sequence text before any <210> SEQ ID NO line: $line\n"
            if !defined $cur_seq;
        $seq{$cur_seq}{seq} .= "$1\n";
        next LINE;
    }

    die "Unrecognized line: $line\n";
}

# Report records in ascending ID order so the output is reproducible
# (plain "keys" order changes from run to run).
for my $id ( sort { $a <=> $b } keys %seq ) {
    my $record   = $seq{$id};
    my $organism = defined $record->{organism} ? $record->{organism} : '';
    my $residues = defined $record->{seq}      ? $record->{seq}      : '';

    print "Sequence: $id, Organism: $organism\n";
    print "seq: $residues\n\n";
}

__DATA__
<160> NUMBER OF SEQ ID NOS: 727
<140> CURRENT APPLICATION NUMBER: US/09/984,429
<210> SEQ ID NO 1
<211> LENGTH: 733
<212> TYPE: DNA
<213> ORGANISM: Homo sapiens
<400> SEQUENCE: 1
gggatccgga gcccaaatct tctgacaaaa ctcacacatg cccaccgtgc ccagcacctg 60
aattcgaggg tgcaccgtca gtcttcctct tccccccaaa acccaaggac accctcatga 120
tctcccggac tcctgaggtc acatgcgtgg tggtggacgt aagccacgaa gaccctgagg 180
tcaagttcaa ctggtacgtg gacggcgtgg aggtgcataa tgccaagaca aagccgcggg 240
aggagcagta caacagcacg taccgtgtgg tcagcgtcct caccgtcctg caccaggact 300
ggctgaatgg caaggagtac aagtgcaagg tctccaacaa agccctccca acccccatcg 360
agaaaaccat ctccaaagcc aaagggcagc cccgagaacc acaggtgtac accctgcccc 420
catcccggga tgagctgacc aagaaccagg tcagcctgac ctgcctggtc aaaggcttct 480
atccaagcga catcgccgtg gagtgggaga gcaatgggca gccggagaac aactacaaga 540
ccacgcctcc cgtgctggac tccgacggct ccttcttcct ctacagcaag ctcaccgtgg 600
acaagagcag gtggcagcag gggaacgtct tctcatgctc cgtgatgcat gaggctctgc 660
acaaccacta cacgcagaag agcctctccc tgtctccggg taaatgagtg cgacggccgc 720
gactctagag gat 733
<210> SEQ ID NO 2
<211> LENGTH: 5
<212> TYPE: PRT
<213> ORGANISM: Homo sapiens
<220> FEATURE:
<221> NAME/KEY: Site
<222> LOCATION: (3)
<223> OTHER INFORMATION: Xaa equals any of the twenty naturally ocurring L-amino acids
<400> SEQUENCE: 2
Trp Ser Xaa Trp Ser
1 5