use strict;
use warnings;
use Data::Dumper;

# Sample taxonomy lineages.  Records are separated by "||", fields within a
# record by " | ", and each field is "rank|name".  The last record is cut
# short ("Basid};") in the sample data and is parsed as-is.
# Every continuation line starts with the separator's leading space so that
# deleting the newlines below rebuilds the single-line form without relying
# on trailing whitespace.
my $rawData = <<'END_OF_DATA';
species|Caragana arborescens | genus|Caragana | subfamily|Papilionoideae
 | family|Fabaceae | order|Fabales | no rank|eurosids I | subclass|Rosidae
 | no rank|core eudicots | no rank|eudicotyledons | no rank|Magnoliophyta
 | no rank|Spermatophyta | no rank|Euphyllophyta | no rank|Tracheophyta
 | phylum|Embryophyta | no rank|Charophyta/Embryophyta group
 | no rank|Streptophyta | kingdom|Viridiplantae | superkingdom|Eukaryota
 | no rank|cellular organisms | no rank|root
 || species|syncytium endosymbiont of Diaphorina citri
 | no rank|unclassified beta proteobacteria (miscellaneous)
 | no rank|unclassified beta proteobacteria | class|beta subdivision
 | phylum|Proteobacteria | superkingdom|Bacteria
 | no rank|cellular organisms | no rank|root
 || subspecies|Trypanosoma brucei brucei | species|Trypanosoma brucei
 | subgenus|Trypanozoon | genus|Trypanosoma | family|Trypanosomatidae
 | order|Kinetoplastida | no rank|Euglenozoa | superkingdom|Eukaryota
 | no rank|cellular organisms | no rank|root
 || species|unculturable Mariana archaeon no. 1
 | no rank|environmental samples | no rank|unclassified Crenarchaeota
 | kingdom|Crenarchaeota | superkingdom|Archaea
 | no rank|cellular organisms | no rank|root
 || species|Suillus aeruginascens | genus|Suillus | family|Boletaceae
 | order|Boletales | subclass|Hymenomycetidae | class|Hymenomycetes
 | phylum|Basid};
END_OF_DATA

# Line breaks in the heredoc are layout only; join everything back together.
$rawData =~ s{\n}{}g;

my @rawRecords = split m{\s*\|\|\s*}, $rawData;
print scalar( @rawRecords ), qq{ records found\n};

my %parsedRecords;
foreach my $rawRecord ( @rawRecords ) {
    my $species = addRecord( \%parsedRecords, $rawRecord );

    # A record that names no species cannot be keyed; report it on STDERR
    # instead of filing it under an undefined key.
    unless ( defined $species ) {
        warn qq{Skipping record with no species field: $rawRecord\n};
        next;
    }
    print qq{Species: $species\n};
}

# Dump is the documented entry point; it uses the XS implementation when
# that is available and falls back to pure Perl otherwise.
my $dd = Data::Dumper->new( [ \ %parsedRecords ], [ q{*parsedRecords} ] );
$dd->Indent( 1 );
print $dd->Dump;

# addRecord( $recordsRef, $rawRecord )
#
# Parses one " | "-separated record of "rank|name" fields and merges it into
# the hash referenced by $recordsRef, keyed by the record's species name.
# A rank seen once is stored as a plain string; a rank seen repeatedly
# (e.g. "no rank") is stored as an array ref in order of appearance.
#
# Returns the species name, or undef (leaving the hash untouched) when the
# record has no non-empty "species" field.
sub addRecord {
    my ( $recordsRef, $rawRecord ) = @_;

    my @pairs;
    foreach my $dataPair ( split m{\s+\|\s+}, $rawRecord ) {
        # LIMIT of 2 keeps any further "|" characters inside the value.
        my ( $key, $value ) = split m{\|}, $dataPair, 2;
        next unless defined $key;    # empty field, nothing to store
        push @pairs, [ $key, $value ];
    }

    # Exact key comparison, so "subspecies" is never mistaken for "species"
    # and a species field in last position is still found.
    my ( $speciesPair ) = grep { $_->[ 0 ] eq q{species} } @pairs;
    return
        unless $speciesPair
        and defined $speciesPair->[ 1 ]
        and length $speciesPair->[ 1 ];

    my $species   = $speciesPair->[ 1 ];
    my $fieldsRef = $recordsRef->{ $species } ||= {};

    foreach my $pair ( @pairs ) {
        my ( $key, $value ) = @{ $pair };

        if ( not exists $fieldsRef->{ $key } ) {
            $fieldsRef->{ $key } = $value;
        }
        elsif ( ref $fieldsRef->{ $key } eq q{ARRAY} ) {
            push @{ $fieldsRef->{ $key } }, $value;
        }
        else {
            # Second occurrence of this rank: promote the scalar to a list.
            $fieldsRef->{ $key } = [ $fieldsRef->{ $key }, $value ];
        }
    }

    return $species;
}
Here's the output.
5 records found Species: Caragana arborescens Species: syncytium endosymbiont of Diaphorina citri Species: Trypanosoma brucei Species: unculturable Mariana archaeon no. 1 Species: Suillus aeruginascens %parsedRecords = ( 'Trypanosoma brucei' => { 'genus' => 'Trypanosoma', 'species' => 'Trypanosoma brucei', 'superkingdom' => 'Eukaryota', 'subgenus' => 'Trypanozoon', 'order' => 'Kinetoplastida', 'subspecies' => 'Trypanosoma brucei brucei', 'family' => 'Trypanosomatidae', 'no rank' => [ 'Euglenozoa', 'cellular organisms', 'root' ] }, 'Caragana arborescens' => { 'kingdom' => 'Viridiplantae', 'genus' => 'Caragana', 'species' => 'Caragana arborescens', 'superkingdom' => 'Eukaryota', 'subfamily' => 'Papilionoideae', 'order' => 'Fabales', 'subclass' => 'Rosidae', 'phylum' => 'Embryophyta', 'family' => 'Fabaceae', 'no rank' => [ 'eurosids I', 'core eudicots', 'eudicotyledons', 'Magnoliophyta', 'Spermatophyta', 'Euphyllophyta', 'Tracheophyta', 'Charophyta/Embryophyta group', 'Streptophyta', 'cellular organisms', 'root' ] }, 'unculturable Mariana archaeon no. 1' => { 'kingdom' => 'Crenarchaeota', 'species' => 'unculturable Mariana archaeon no. 1', 'superkingdom' => 'Archaea', 'no rank' => [ 'environmental samples', 'unclassified Crenarchaeota', 'cellular organisms', 'root' ] }, 'syncytium endosymbiont of Diaphorina citri' => { 'phylum' => 'Proteobacteria', 'class' => 'beta subdivision', 'species' => 'syncytium endosymbiont of Diaphorina citri', 'superkingdom' => 'Bacteria', 'no rank' => [ 'unclassified beta proteobacteria (miscellaneous)', 'unclassified beta proteobacteria', 'cellular organisms', 'root' ] }, 'Suillus aeruginascens' => { 'subclass' => 'Hymenomycetidae', 'order' => 'Boletales', 'genus' => 'Suillus', 'phylum' => 'Basid};', 'class' => 'Hymenomycetes', 'species' => 'Suillus aeruginascens', 'family' => 'Boletaceae' } );
I hope this is of use.
Cheers,
JohnGG
In reply to Re: capturing separately
by johngg
in thread capturing separately
by ada
| For: | Use: |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |