Here's one possible approach to handling your data. Some notes and caveats:

use 5.010; # need \K use warnings; use strict; use autodie; use IO::Handle; use List::Util qw(reduce); use Data::Dump qw(dd pp); use Test::More # tests => ?? + 1 # Test::NoWarnings adds 1 test 'no_plan' ; use Test::NoWarnings; use constant EXPECTED_OUTPUT => <<EOT; ENSG00000113916___1|1x3\tAAGAATGTCA ENSG00000113916___1|2x3\tAGAAAGAATA ENSG00000113916___1|3x3\tACTAGAAAGA EOT use constant DEBUG => 0; use constant DB_PRINT => 1 && DEBUG; use constant { # debug print points DBP_MAIN_1 => 1 && DB_PRINT, DBP_MAIN_2 => 1 && DB_PRINT, DBP_MAIN_3 => 1 && DB_PRINT, }; use constant REC_START => 'TF'; use constant SET_SIZE => 4; # whitespace separators are tab characters # e.g., 'Motif ENSG00000113916___1|2x3' use constant MOTIF => qr{ ^ Motif \s+ \K \S+ $ }xms; # e.g., 'Pos A C G T' use constant BASES => qr{ (?: \G | ^ Pos) \s+ \K [ACGT] \b }xms; # e.g., '1 0.538498 0.157305 0.157633 0.146564' use constant VALUES => qr{ (?: \G | ^ \d+) \s+ \K 0 [.] \d+ \b }xms; MAIN: { # new scheme??? # multi-line records with no unambiguous record delimiter: use # a jury-rig. # multiple spaces instead of tab as field separator. (new?) IO::Handle->input_record_separator(REC_START . qq{ Unknown\n}); <DATA>; # throw away 1st junk 'record'. my $output; RECORD: while (my $record = <DATA>) { CURRENT_RECORD: for ($record) { print qq{[[$_]] \n} if DBP_MAIN_1; my ($motif) = m{ ${ \ MOTIF } }xmsog; my @base_ord = m{ ${ \ BASES } }xmsog; my @base_values = m{ ${ \ VALUES } }xmsog; print qq{motif '$motif' \n} if DBP_MAIN_2; DBP_MAIN_2 && print qq{'$_' } for @base_ord, qq{\n}; DBP_MAIN_2 && print qq{'$_' } for @base_values, qq{\n}; @base_ord == SET_SIZE or die qq{wrong base order: (@base_ord) }; @base_values % SET_SIZE == 0 or die 'values not multiple of 4'; my $max_bases; MAX_BASES: for (my $i = 0; $i < @base_values; ) { map $max_bases .= $_->[1], # base of max reduce { $a->[0] > $b->[0] ? $a : $b } # max in group map [ $base_values[$i++], $_ ], @base_ord # groups of 4 ; } my $output_record = qq{$motif\t$max_bases\n}; print qq{new record '$output_record' \n} if DBP_MAIN_3; $output .= $output_record; } # end for CURRENT_RECORD } # end while RECORD ok $output eq EXPECTED_OUTPUT, qq{expected output}; exit(0); # expected MAIN exit } # end MAIN block die qq{unexpected MAIN exit}; # NOTE: removed extraneous (?) three lines from example data set # posted with pm 1079539 (or thereabouts) __DATA__ TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|1x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.664794 0.13099 0.0810125 0.123203 2 0.675621 0.0396475 0.144967 0.139764 3 0.0913393 0.0396819 0.847004 0.0219745 4 0.850414 0.0522149 0.0519174 0.0454536 5 0.89157 0.00962148 0.0845269 0.0142814 6 0.122389 0.0875591 0.0734604 0.716591 7 0.226696 0.00745549 0.745549 0.0202999 8 0.156228 0.151994 0.128767 0.563011 9 0.22083 0.561173 0.12007 0.0979266 10 0.507656 0.0711684 0.0652815 0.355894 TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|2x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.538498 0.157305 0.157633 0.146564 2 0.0728444 0.00877167 0.877166 0.0412175 3 0.959269 0.0131077 0.0159611 0.0116621 4 0.852439 0.0238831 0.0168134 0.106864 5 0.57332 0.0688014 0.181385 0.176494 6 0.139513 0.0747988 0.737607 0.0480813 7 0.735484 0.0912993 0.09091 0.0823067 8 0.79932 0.0270417 0.137306 0.0363319 9 0.16103 0.12536 0.109938 0.603672 10 0.622356 0.06782 0.115463 0.194361 TF Unknown TF Name Unknown Gene ENSG00000113916 Motif ENSG00000113916___1|3x3 Family C2H2 ZF Species Homo_sapiens Pos A C G T 1 0.616484 0.0886488 0.24602 0.0488468 2 0.0971289 0.591289 0.134781 0.176801 3 0.0715039 0.0237142 0.0432674 0.861514 4 0.73769 0.117011 0.059703 0.0855963 5 0.0728444 0.00877167 0.877166 0.0412175 6 0.959269 0.0131077 0.0159611 0.0116621 7 0.852439 0.0238831 0.0168134 0.106864 8 0.57332 0.0688014 0.181385 0.176494 9 0.139513 0.0747988 0.737607 0.0480813 10 0.615257 0.189034 0.125514 0.0701943

In reply to Re: First foray into Perl by AnomalousMonk
in thread First foray into Perl by LostWeekender

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.