#!/usr/bin/perl
#
# Parse simple FASTA text, chunk by chunk.
#
# Reads 'fasta.txt' one record at a time by using '>' as the input record
# separator. For every record it prints the ID, the zero-based position of
# the record in the file, and the record's sequence lines.

use strict;
use warnings;

my $fastafile = 'fasta.txt';
my $pos       = 0;

open(my $fasta_fh, "<", $fastafile) or die("Open failed: $!");

{
    # With '>' as the record separator, each read returns the text up to
    # and including the next '>'. 'local' puts the previous separator back
    # automatically when this block is left, however it is left.
    local $/ = ">";

    RECORD:
    while (my $record = <$fasta_fh>) {
        # chomp removes the current $/, i.e. the trailing '>'.
        chomp $record;

        # Since the file begins with ">", the first extraction will
        # contain only that '>', which will then get chomped, so we'll
        # have an empty record to skip. Whitespace-only chunks (such as
        # blank lines before the first '>') carry no ID either.
        next RECORD if $record !~ /\S/;

        my ($id, @sequencelines) = split /\n/, $record;

        # I'm not sure if the ID is supposed to include the '>' in front
        # of it or not, but if so, we can put it back.
        $id = '>' . $id;

        print "Found ID '",$id,"' at position ",$pos,":\n";

        for my $line (@sequencelines) {
            print $line,"\n";
        }
        print "\n";

        # The whole sequence as a single string, ready for any further
        # processing of this record; this script itself only prints.
        my $sequence = join '', @sequencelines;

        $pos++;
    }
}

close($fasta_fh) or die("Close failed: $!");

####
#
# Sample output:
#
# Found ID '>ELKSMKO02JGD0L' at position 0:
# TCAGGAATCTAATACTCAAGCTGTGGCCTATCCAGTACAACATGTAGCGAGACAATAATATCTCAGGATC
# TGAATACACCCCTTCTGTTAAAATGCAGTCTAGGATTACACTAGCTTTGTTCACAGCCACGTAACACCAC
# TGACTCACATGAAGACTGAAGACAACACAACCCCCCACATCTTGTTCACAAAAACTGGTAGCATGCCAGG
# TCTTCCATATCTTTACAGGACACTTGGTATTTTACAAAACTTAATTC
#
# Found ID '>ELKSMKO02FEYZW' at position 1:
# TCAGTCATAATGTCATTTCTTCAAAACTTGATCTGTAGATTTAATGGAACCCCAATCAAAATTCCAGCAA
# ATTATTAAGTGGATATCCACAAACTGGTTCAAAAGTTTATATGAAATAACAAAAGATCCAGAACAGCCAA
# CATAATATTGAAGGAGAAGAATGAAGTTCGAGGTCTAACAAACTAATTTCTGTATGATTCCAACTACATG
# GCATTCTGGAAAATGAAAAATTACAGACACAGTAAATAGCTCAGTGATTGCCAGGTAGG
#
# Found ID '>ELKSMKO02IX3A4' at position 2:
# TCAGTCCCAACGTGCTGGGAGGGCGTGAGCCACGGTGCCCAGCCTTTTTATTTTTTATTTTTATTTTTAA
# TCTGTCTTGATTTTGCTTCCTTCCTAAACAGTTTTGGCTTCGTGATCACGTAAACCAAGAGTCACAAACT
# GAAATGCCATCAAGGGGCCAAGCAGGTAACAAAATTCAAGTCATACAGGTTCAATGTCTTAGTCACCCCA
# GGCTACAACAGAATATCATAGACTGGGTANCCTAATAATACAGATCATTTTCNCATGGTTCTAGAGGAC