perl_n00b has asked for the wisdom of the Perl Monks concerning the following question:
One of my problems is my ($sequence) = $chunk =~ /([a|c|t|g]+)\s+/; I know this isn't right, but I have gone through a lot of different syntax's for this to no avail. What's the best way to parse the sequence out?use strict; use warnings; $/ = ">"; my $fastafile = 'j:\summer\begomo_genomes2.fasta'; my ($OUT, $IN); print "Input: $fastafile\n"; open my $ifh, "<", $fastafile or die "cannot open $fastafile: $!\n"; while (my $chunk = <$ifh>){ $chunk = lc $chunk; my ($accession) = $chunk =~ /gi\|(\d+)/; my ($acronym) = $chunk =~ /\|\s+(.*),/; $acronym =~ s/\[|\]|\:|\d+|\-\///g; $acronym =~ s/\s/_/g; $acronym =~ s/dnaa/dna_a/g; $acronym =~ s/dnab/dna_b/g; my ($sequence) = $chunk =~ /([a|c|t|g]+)\s+/; my $outfile = "j:\\summer\\begomo\\${accession}_${acronym}.seq"; open my $ofh, ">", $outfile or die "cannot open $outfile: $!\n"; print "Creating $outfile\n"; print $ofh "$accession $acronym\n^^\n$sequence"; close $ofh; }
>gi|9626081|ref|NC_001359.1| Pepper huasteco yellow vein virus DNA A, +complete sequence GGCCATCCGTTATAATATTACCGGATGGCCGACCGCTTACCTTATCTATCCGTACTGCTTTATTTGAATT AAAGATGTTACTTTTATGCTATCCAATGAAGCGTAGCGTCTGGGAAGCTTAGTTATCAGTTCCAGACGTG GGGACCAAGTAGTGTATGACCACTTTATTGACTGTCAGCTTTATAAATTGAAATTAAAACATAAGTGGTC CATGTACCTTTAATTCAAAATGCCTAAGCGTGATGCTCCTTGGCGATTAACGGCGGGGACCGCCAAGATT AGCCGAACTGGCAATAATTCACGGGCTCTTATCATGGGCCCGAGTACTAGCAGGGCCTCAGCTTGGGTTA ATCGCCCAATGTACAGGAAGCCCCGGATTTATCGTATGTACAGAACTCCGGATGTGCCGAAAGGTTGTGA AGGTCCCTGTAAGGTTCAATCGTTTGAACAACGACATGACGTCTCTCATGTTGGTAAGGTTATTTGTATA TCCGACGTAACTCGTGGTAATGGTATTACCCATCGTGTTGGCAAACGATTCTGCGTTAAGTCTGTCTATA TTCTGGGCAAAATCTGGATGGATGAAAATATTAAGTTGAAGAACCATACCAACAGTGTCATGTTTTGGTT GGTTAGGGATAGGAGACCCTACGGTACGCCTATGGATTTTGGCCAAGTCTTTAACATGTATGACAACGAG CCCAGTACCGCTACTGTGAAGAACGATCTTCGGGATCGTTATCAAGTTATGCATAGATTCTATGCTAAGG TCACTGGTGGGCAATATGCAAGCAACGAGCAAGCCTTGGTTAGGCGTTTCTGGAAGGTGAACAACCATGT TGTGTATAACCATCAAGAAGCTGGGAAATATGAGAACCACACGGAGAATGCGCTGTTATTGTATATGGCA TGTACTCATGCATCTAATCCCGTGTATGCAACACTCAAAATTCGGGTCTATTTTTATGACTCGATAATGA ATTAATAAAGTTTGTATATTATTTCATGATTCTCAAGTACAGCATTGACATAACGTTTGTCTGTAGCAAA CGAAACAGCCCTAATTACATTGTTAACTGAAATAAGACCTAAGTTATCTAGATAAAACATGACAAGCAAT TTAAATCTATTTAAGTAAATCTGCCCAGAAATCGTCGTCAACGTCGTCCAGACTTGGAAGTTGAAGTAGG CTTTGTGGAGACCCAACGCTGTCCTCATGTTGTGGTTTGCTCTGACTTGAATGTGAAATACCCGGCTGCG TGTGTACATTGGCGTCTCCACTAGCCGTATTTTGAAATATAGGGGATTTCGAAGCTCCCAGATAAAAACG CCATTCGCTGCTTGAGCTGCAGTGATGGGTACCCCGGTGCGTAAATCCATTGTTAACACAGTTAATATGT ATATAAATTGAACAGCCGCAAGCGAGATCAATCCTTCTACGTCGTATCTGTCTCTTTGCAAATCTATGGC GAAGTTTGACTTCCGGTGGTGAAGATAGCTTCTTCGATGGTGACGTAGATGGCGTTTTTTTGGACCCAGT CATTGAGGCTCCTATTTTTCTCTTCGCTGAGGTAGTCTTTATAGGAAGACTGGGGGCCCGGATTGCAAAG AAAGATAGTGGGTATCCCACCTTTAATTTGAATTGGTTTGCCGTATTTGCAGTTTGATTGCCAATCCCGT TGGGCCCCCATAAACTCTTTAAAGTGCTTAACATAATGCGGAGGGATGTCATCAATGACGTTATACCATG CATTATTGGAGTAGATTTTTGGGCTGAGATCCATATGACCACATATGTAATTGTGTGGGCCGAGACTTCG GGCCCATAATGTTTTGCCTGTCCGTGAAGGACCCTCGACCACTAATGACAATGGTCTCATTGGCCGCGCA GCGGCATCACATACATTATCAGACACCCATTGTGTCATTATTGCAGGCACATTATTAAAGGACGCCTGTT GAAATGGAGGAACCCACGGTTCCGGGGGAGTTTGGAATATCCGATTAGCGTTTGACACAATGTTATGAAA TTGGAGGAAGAAATGCTGAGGTTGTTCTTCCTTTATGATCTGCAGAGCTTCTTCTGCAGATGCTGAATTT AACGCCTTAGCATATGTGTCATTAGCAGACTGCTGTCCTCCTCTAGCAGATCTGCCGTCTATTTGGAATT CTCCCCATTCTACGGTATCGCCGTCTTTGTCGATGTACGTCTTGACGTCGGAGCTTGATTTAGCTCCCTG AATGTTCGGATGGAAATGTGCTGATCTGGTAGAGGATACGAGGTCAAAGAATCGGTTGTTCGTGCATTGG TATTTTCCTTCGAACTGAATAAGCACGTGCAGATGAGGTTGCCCATCTTCATGAGATTCTTTGCAAATTT TGATGTACTTCTTGTTTACCGGCGTCGAGAGGTTTTGTAGTTGAGCGAGACGCTCTTCTTTGGAAATGGA ACATTGTGGATAGGTGAGGAAATAATTCTTGGCATTTAAACGAAATCGTTTAGGTAATGGCATATTTGTA ATAAGAGAGGTGTACACCGATTGGAGCTCTTTAACCTGGGCTTATTGTATCGGTGTATTGGTAGCCAATA TATAGTATATGGGAGTTATCTAGGATCTTCGTACACGTGAG >gi|9626131|ref|NC_001369.1| Pepper huasteco yellow vein virus DNA B, +complete sequence GGCCATCCGTTATAATATTACCGGATGGCCGACCGCTTCCACTCTCTTTCCTTTGGGACAGCTGGCGCGC ACTATGTATTATGTTTACGTGGCATCATGTGGGTCGTTGGATGAATTCAATCGCGCGCCTTCATTTCAAA TTAAAGTGTGTGTCCATACATCGAGAAATGTGTAATGACGTGGAGCGTTCTCCACCATTCCTGAATCGTT AGATAATTGTTTGACCAGGACCACAGCTGTCATTTGGGACCACACGTCCTTTGGGACCACCACTATAATG ATAATGTTTCCTGTTATTGCGGTCCACGTGGTCCAATTAAATTGCACCTCGCGAGTCTACATATCCACAA TTTTGAATATCCTATTCTATAAAATGGCTTCCATTTTTATATTCAAAATTATATTCACATCTCTTTTAAT ATATATTTATCTTTAAGCAATTTAATATGTATTCTACTAGATTTAGACGTGGGTTATCCTATGTTCCACG GCGTTATAATCCACGTAATTATGGTTTTAAACGTACATTCGTCGTTAAACGTGGTGATGCTAAACGACGT CAGACTCAAGTGAAGAAACTAACAGAAGATGTTAAAATGTCATCACAACGCATCCATGAAAATCAATATG GTCCAGAATTTGTCATGGCGCATAATACAGCAATATCTACATTCATCAATTATCCCCAACTGTGTAAGAC TCAGCCCAATCGTAGTAGGTCATATATTAAGTTAAAATCGTTACATTTTAAGGGAACCTTAAAGATCGAA CGTGTTGGGTCTGAGGTAAATATGGCTGGGTTAAATCCGAAGATTGAGGGTGTGTTTACTGTGGTTTTAG TTGTTGACCGTAAGCCACATTTGAATCCTACTGGTAACTTGCTACAGTTTGACGAGTTATTTGGTGCAAG AATTCACAGTCTAGGGAACTTAGCCGTTACCCCGGCGTTGAAAGAACGGTTCTACATACTGCATGTGTTG AAGCGAGTTATCTCCGTTGAGAAGGATAGTATGATGCTGGACCTAGAAGGATCCACTTGTCTCTCTAGTC GGCGTTATAATTGTTGGTCTACATTTAAGGACCTTGATCCTTCGTCATGTAACGGCGTCTATGATAATAT AAGCAAAAACGCCATATTAGTTTATTATTGTTGGATGTCGGATGCTATGTCTAAGGCATCCACATTTGTA TCATTTGATTTGGACTATTTTGGTTAAGAAATAATTGACTTGCGTAGTTTGCTCATATTTGTATTTTGTC ACAAAATAAAATATTATTATCTTAGCGACTTCGGTTGTGTCGGATTACAATTACTGTTAATACATTCATG GACCGTAGTCCTTACAAGCTCATTCAACTGGGCCAAGGACATAGTTATATTTGATTGAGAGCGTGTTAGA CCCACTTGTGATGCTGAATCACCTGGGTCCAAAACACTTCCGCCTAACTGATGAAGATCTTTATACGGAT GTAATGCGCTATGTCCTTGGTTGTCAGCATCTGTGTGAGTGGTTCCTATGGTGCTTCTACAAGCCCAGGA TTCACCTGGTTTTAATTCAATTGGGCCTGTAATGCCGAACCTTGACATGGATGCTGACCTCAATGGTTTT CTCTCCCACCTGCCGTAGTCCACATGTGTAAAGTCCACATCGTTATGGGTGAACTGTTTCGATAAAATCT TCACCGTCGGAGCCCGGAAAGGTATATCCACGGAGTGTTTAGCTGTGGACAACTTCAATTTCCCTTTGAA CTTGGCAAAATGGGTGTTCTGATGTACGTTAGTATCGGAGACTCTGTAATATAGCTTCCAGGGTATGGGG TCCTTCAAGGAGAAGAAGGATGCTGAGAAATAATGGAGATCGATGTTACATCTTAGTGGAAATGTCCAAG AAGCTTGTAATGATTCATTGTCTGTCATTCGTTTGTCATGGATTTCCACTATGACCGACCCAGTGGCGTT TATCGGAACTTGCTGTCTATACTCGATAACGCAATGGTCAATTTTCATACAGCTACGACTAAGTCTGGCA GCGTACTGCGACGCCGTTGACGGAAATTGAAGTATTATCTCCGTTAAGTCATGAGAGAGCTGATATTCAT CTCTATGTGACTCTATATAATTGAATGCGCTAGGAGGATTCGCCAACCATGAATCCATATATGAAAATTT GGCAGCGCACGTGAAGGCTTACGGAGTCTGAATCTGGTAATAAGAAGCTATACCTAACAATGTTAATGGT AATGAAAATGACAAATTACTATTTGCTGAAAGAGTTCAAAAATAAATGCTTACTTAGTTATTAAGATATT GCTATTAGCAGCAACAATATATGAGGAAACCGGTGAGGATGAAAGCAAAAGCGTCTTCAGAAGACAGAGC AGAAAGAATTGGTATGAATAATTAAATGAACAGGCAGTGTCGTTATATAGAAGATCATTGTGTTTTAGAG AGAGAAAATTTTGCAGTGGCATTTGTGTAATATGGAGGGGTACACCGATTGGAGCTCTTTAACCTGGGCT TATTGTATCGGTGTATTGGTAGCCAATATATAGTATATGGGAGTTATCTAGGATCTTCGTACACGTGGA
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: FASTA Splitter
by John M. Dlugosz (Monsignor) on Jun 01, 2009 at 19:38 UTC | |
|
Re: FASTA Splitter
by citromatik (Curate) on Jun 01, 2009 at 22:39 UTC | |
|
Re: FASTA Splitter
by lamprecht (Friar) on Jun 01, 2009 at 21:32 UTC | |
|
Re: FASTA Splitter
by perliff (Monk) on Jun 02, 2009 at 08:55 UTC |