in reply to Re^7: First foray into Perl
in thread First foray into Perl
Here's a version for double-newline record separators, still no tabs, only 1+ spaces separate fields. Included new ENSG00000113916___1|4x3 motif in test data. Most of notes and caveats of Re: First foray into Perl still apply.
# NOTE: still no tabs; spaces only separate fields. # double-newline record delimiter. use 5.010; # need \K use warnings; use strict; use autodie; use IO::Handle; use List::Util qw(reduce); use Data::Dump qw(dd pp); use Test::More # tests => ?? + 1 # Test::NoWarnings adds 1 test 'no_plan' ; use Test::NoWarnings; use constant EXPECTED_OUTPUT => <<EOT; ENSG00000113916___1|4x3\tAGTACTAGAA ENSG00000113916___1|1x3\tAAGAATGTCA ENSG00000113916___1|2x3\tAGAAAGAATA ENSG00000113916___1|3x3\tACTAGAAAGA EOT use constant DEBUG => 0; use constant DB_PRINT => 1 && DEBUG; use constant { # debug print points DBP_MAIN_1 => 1 && DB_PRINT, DBP_MAIN_2 => 1 && DB_PRINT, DBP_MAIN_3 => 1 && DB_PRINT, }; use constant SET_SIZE => 4; # number of bases in each freq. (?) set # whitespace separators are tab characters # e.g., 'Motif ENSG00000113916___1|2x3' use constant MOTIF => qr{ ^ Motif \s+ \K \S+ $ }xms; # e.g., 'Pos A C G T' use constant BASES => qr{ (?: \G | ^ Pos) \s+ \K [ACGT] \b }xms; # e.g., '1 0.538498 0.157305 0.157633 0.146564' use constant VALUES => qr{ (?: \G | ^ \d+) \s+ \K 0 [.] \d+ \b }xms; MAIN: { # new scheme # multi-line records with double-newline delimiter. # multiple spaces instead of tab as field separator. (still) IO::Handle->input_record_separator(qq{\n\n}); my $output; RECORD: while (my $record = <DATA>) { CURRENT_RECORD: for ($record) { print qq{[[$_]] \n} if DBP_MAIN_1; my ($motif) = m{ ${ \ MOTIF } }xmsog; my @base_ord = m{ ${ \ BASES } }xmsog; my @base_values = m{ ${ \ VALUES } }xmsog; print qq{motif '$motif' \n} if DBP_MAIN_2; DBP_MAIN_2 && print qq{'$_' } for @base_ord, qq{\n}; DBP_MAIN_2 && print qq{'$_' } for @base_values, qq{\n}; @base_ord == SET_SIZE or die qq{wrong number of bases: }, qq{(@base_ord), expected }, SET_SIZE, ; @base_values % @base_ord == 0 or die 'values not multiple of number of bases'; my $max_bases; MAX_BASES: for (my $i = 0; $i < @base_values; ) { map $max_bases .= $_->[1], # base of max reduce { $a->[0] > $b->[0] ? $a : $b } # max in group map [ $base_values[$i++], $_ ], @base_ord # groups of 4 ; } my $output_record = qq{$motif\t$max_bases\n}; print qq{new record '$output_record' \n} if DBP_MAIN_3; $output .= $output_record; } # end for CURRENT_RECORD } # end while RECORD ok $output eq EXPECTED_OUTPUT, qq{expected output}; exit(0); # expected MAIN exit } # end MAIN block die qq{unexpected MAIN exit}; # NOTE: removed extraneous three lines from end of # example data posted with pm 1079705 __DATA__ TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|4x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.427379 0.0647991 0.288826 0.218996 2 0.201974 0.139791 0.35254 0.305695 3 0.11714 0.118042 0.143884 0.620934 4 0.637331 0.0996546 0.228428 0.0345867 5 0.0971289 0.591289 0.134781 0.176801 6 0.0715039 0.0237142 0.0432674 0.861514 7 0.73769 0.117011 0.059703 0.0855963 8 0.0728444 0.00877167 0.877166 0.0412175 9 0.959269 0.0131077 0.0159611 0.0116621 10 0.612865 0.057845 0.0583267 0.270963 TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|1x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.664794 0.13099 0.0810125 0.123203 2 0.675621 0.0396475 0.144967 0.139764 3 0.0913393 0.0396819 0.847004 0.0219745 4 0.850414 0.0522149 0.0519174 0.0454536 5 0.89157 0.00962148 0.0845269 0.0142814 6 0.122389 0.0875591 0.0734604 0.716591 7 0.226696 0.00745549 0.745549 0.0202999 8 0.156228 0.151994 0.128767 0.563011 9 0.22083 0.561173 0.12007 0.0979266 10 0.507656 0.0711684 0.0652815 0.355894 TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|2x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.538498 0.157305 0.157633 0.146564 2 0.0728444 0.00877167 0.877166 0.0412175 3 0.959269 0.0131077 0.0159611 0.0116621 4 0.852439 0.0238831 0.0168134 0.106864 5 0.57332 0.0688014 0.181385 0.176494 6 0.139513 0.0747988 0.737607 0.0480813 7 0.735484 0.0912993 0.09091 0.0823067 8 0.79932 0.0270417 0.137306 0.0363319 9 0.16103 0.12536 0.109938 0.603672 10 0.622356 0.06782 0.115463 0.194361 TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|3x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.616484 0.0886488 0.24602 0.0488468 2 0.0971289 0.591289 0.134781 0.176801 3 0.0715039 0.0237142 0.0432674 0.861514 4 0.73769 0.117011 0.059703 0.0855963 5 0.0728444 0.00877167 0.877166 0.0412175 6 0.959269 0.0131077 0.0159611 0.0116621 7 0.852439 0.0238831 0.0168134 0.106864 8 0.57332 0.0688014 0.181385 0.176494 9 0.139513 0.0747988 0.737607 0.0480813 10 0.615257 0.189034 0.125514 0.0701943
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^9: First foray into Perl
by LostWeekender (Novice) on Mar 26, 2014 at 20:30 UTC | |
by AnomalousMonk (Archbishop) on Mar 27, 2014 at 17:36 UTC | |
by Cristoforo (Curate) on Mar 27, 2014 at 22:45 UTC |