#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;

# Parses the gene/epitope records embedded after __DATA__ and dumps them
# as a hash of hashes keyed by gene identifier.
#
# Record layout, as seen in the embedded data:
#   Gene: <id>
#   TABLE: ...                       (ignored)
#   <column header row>              (ignored)
#   <data row starting with a digit> (the only line that is parsed)
#   ------------------------------   (a line starting with '-' ends the record)

my %Gene_DB;    # gene id => { Epitope, Sequence, Location, Protein, Strain, Confidence }

# Scan for "Gene:" header lines; add_gene() then consumes the remainder of
# that record from the same DATA handle.
while (my $line = <DATA>) {
    # The list assignment yields the match count, so a gene id of "0" is
    # still accepted while non-matching lines are skipped.
    my ($gene) = $line =~ /^Gene:\s+(\S+)/ or next;
    add_gene($gene);
}

# add_gene($gene)
#
# Reads lines from DATA up to the end-of-record marker (a line starting
# with '-') or end of input, parses the record's data row and stores the
# result in %Gene_DB under $gene.
#
# If a record contains several rows starting with a digit, the last one wins.
#
# Returns the stored hash reference, or nothing (after a warning) when the
# record has no parsable data row.
sub add_gene {
    my $gene = shift;

    my $data_line;
    while (my $line = <DATA>) {
        last if $line =~ /^-/;          # end of record
        next unless $line =~ /^\d/;     # only the numeric data row matters
        $data_line = $line;
    }

    my $record = defined $data_line ? _parse_epitope_line($data_line) : undef;
    unless ($record) {
        warn "No usable epitope line found for gene '$gene'; skipping\n";
        return;
    }

    $Gene_DB{$gene} = $record;
    return $record;
}

# _parse_epitope_line($line)
#
# Splits one whitespace-separated data row into named fields and returns
# them as a hash reference. The header row does not say unambiguously how
# the free-text middle of the row divides into columns, so the row is cut
# positionally:
#   first three tokens -> Epitope, Sequence, Location
#   last two tokens    -> Strain, Confidence
#   whatever remains   -> Protein (joined with single spaces, may be empty)
#
# Returns nothing when the row has fewer than the five mandatory tokens.
sub _parse_epitope_line {
    my ($line) = @_;

    my @tokens = split ' ', $line;
    return if @tokens < 5;

    my ($epitope, $sequence, $location) = splice @tokens, 0, 3;
    my ($strain, $confidence)           = splice @tokens, -2;

    return {
        Epitope    => $epitope,
        Sequence   => $sequence,
        Location   => $location,
        Strain     => $strain,
        Confidence => $confidence,
        Protein    => join(' ', @tokens),
    };
}

print Dumper(\%Gene_DB);

# Prints (hash key order may vary between runs):
#$VAR1 = {
#          'PF14_0747' => {
#                           'Protein' => 'Plasmodium falciparum',
#                           'Epitope' => '26850',
#                           'Confidence' => 'Medium',
#                           'Strain' => '3D7',
#                           'Location' => '1914-1917',
#                           'Sequence' => 'IKND'
#                         },
#          'PF14_0711' => {
#                           'Protein' => 'Plasmodium falciparum',
#                           'Epitope' => '26850',
#                           'Confidence' => 'Medium',
#                           'Strain' => '3D7',
#                           'Location' => '9-12',
#                           'Sequence' => 'IKND'
#                         }
#        };

__DATA__
Gene: PF14_0747
TABLE: Epitopes from IEDB
Epitope Sequence Location on Protein Strain Confidence
26850 IKND 1914-1917 Plasmodium falciparum 3D7 Medium
------------------------------------------------------------
Gene: PF14_0711
TABLE: Epitopes from IEDB
Epitope Sequence Location on Protein Strain Confidence
26850 IKND 9-12 Plasmodium falciparum 3D7 Medium
------------------------------------------------------------