#!/usr/bin/perl
#
# Join sequence records that share the same number and the same gene id.
#
# Input format (a sample is embedded after __DATA__):
#   >NUMBER_PART geneidGENE START END len=LENGTH
#   one or more sequence lines made of the bases A, C, G and T
#
# Facts:
#   1) The embedded data is only a sample. The real file will be about 2 GB
#      in size, with thousands to millions of bases per record.
#
# Problem:
#   1) Sequences belonging to the same number and the same gene id have to
#      be joined.
#   2) The output is a list of sequences; each one is the set of parts of a
#      number joined together under a single header.
#
# Sample output for the embedded data:
#   >1 gid1
#   AGTCGA
#   GCAA
#   AGTCGAAGTCGA
#   ACAAACAAT
#   >2 gid1
#   CGTCG
#   >1 gid2
#   AGTCGAA
#   GCAA
#   >2 gid2
#   AGTCGAAT
#   AAAAAA
#   AGTC
#   GCAA
#
# Usage:
#   perl script.pl [INPUT_FILE]
# Without INPUT_FILE the sample after __DATA__ is processed.

use strict;
use warnings;

# read_groups($fh)
#
# Reads records from the open filehandle $fh and returns a hash reference
# laid out as $groups->{GENE}{NUMBER}{PART} = [ sequence lines ].
#
# A record starts at a header line and collects every following line that
# begins with A, C, G or T.  Any other line closes the record; lines are then
# ignored until the next header.  Ids are stored as plain numbers, so "01"
# and "1" name the same group.
#
# NOTE: every sequence line is kept in memory, because parts may appear out
# of order in the input (e.g. 2_3 before 2_2) and must be sorted on output.
sub read_groups {
    my ($fh) = @_;

    my %groups;
    my @key;    # (gene, number, part) of the open record; empty when none

    LINE:
    while (my $line = <$fh>) {
        chomp $line;

        if ($line =~ /^>(\d+)_(\d+)\s+geneid(\d+)/) {
            @key = ($3 + 0, $1 + 0, $2 + 0);
            next LINE;
        }

        if (@key && $line =~ /^[ACGT]/) {
            # The slot is created on the first sequence line only, so a
            # header without any sequence never produces an empty group.
            push @{ $groups{ $key[0] }{ $key[1] }{ $key[2] } }, $line;
            next LINE;
        }

        # Neither a header nor a sequence line: the current record is over.
        @key = ();
    }

    return \%groups;
}

# write_groups($groups, $out)
#
# Prints the structure built by read_groups() to the filehandle $out.
# Groups are ordered by gene id, then by number; inside a group the parts
# are ordered by part number.  Each group is introduced by one header line
# of the form ">NUMBER gidGENE", followed by its sequence lines.
sub write_groups {
    my ($groups, $out) = @_;

    for my $gene (sort { $a <=> $b } keys %{$groups}) {
        my $numbers = $groups->{$gene};

        for my $number (sort { $a <=> $b } keys %{$numbers}) {
            my $parts = $numbers->{$number};

            print {$out} ">$number gid$gene\n";

            for my $part (sort { $a <=> $b } keys %{$parts}) {
                print {$out} "$_\n" for @{ $parts->{$part} };
            }
        }
    }

    return;
}

# main($path)
#
# Processes the file named $path, or the embedded sample when $path is not
# given, and writes the joined sequences to STDOUT.
sub main {
    my ($path) = @_;

    my $in;
    if (defined $path) {
        open $in, '<', $path or die "Cannot open '$path': $!\n";
    }
    else {
        $in = \*DATA;
    }

    write_groups(read_groups($in), \*STDOUT);

    if (defined $path) {
        close $in or die "Cannot close '$path': $!\n";
    }

    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
main(@ARGV) unless caller;

1;

__DATA__
>1_1 geneid1 34 45 len=10
AGTCGA
GCAA
>1_2 geneid1 54 75 len=21
AGTCGAAGTCGA
ACAAACAAT
>2_1 geneid1 78 83 len=5
CGTCG
>1_1 geneid2 14 25 len=11
AGTCGAA
GCAA
>2_1 geneid2 4 12 len=8
AGTCGAAT
>2_3 geneid2 19 27 len=8
AGTC
GCAA
>2_2 geneid2 89 95 len=6
AAAAAA