in reply to Re: working with files
in thread working with files

Sorry, I wasn't clear. I need to group on A and D. If the other fields (B, C, etc.) are the same within a group, I would print the actual value; otherwise I would print B:MULTIPLE, C:MULTIPLE, etc. STR is the start sequence number, END is the end sequence number, and the other field (CNT) is just a count that sums up how many of each exist.

Thanks

Replies are listed 'Best First'.
Re^3: working with files
by almut (Canon) on Apr 12, 2010 at 17:26 UTC

    From your sample data/output it's not quite clear to me what you mean by "group on A and D"... but here's something that groups by A alone:

    #!/usr/bin/perl
    #
    # Collapse records that share the same leading field (here "A:n") into
    # one summary line per group.
    #
    # Every record is a whitespace-separated list of NAME:VALUE tokens.  The
    # first token is used verbatim as the group key; the remaining tokens are
    # aggregated by name:
    #   STR   - value from the first record of the group
    #   END   - value from the last record of the group
    #   CNT   - sum over all records of the group
    #   other - set of distinct values; reported as MULTIPLE when more than
    #           one distinct value was seen
    #
    # STR/END rely on the input being ordered; compute min/max instead if it
    # is not.
    use strict;
    use warnings;

    my %groups;

    while (my $line = <DATA>) {
        my ($grp, @other) = split ' ', $line;
        next unless defined $grp;    # blank line: nothing to aggregate

        for my $field (@other) {
            my ($name, $val) = split /:/, $field, 2;
            next unless defined $val;    # token without a "NAME:VALUE" shape

            if ($name eq "STR") {
                # keep only the first value seen for this group
                $groups{$grp}{$name} = $val
                    unless defined $groups{$grp}{$name};
            }
            elsif ($name eq "END") {
                # later records overwrite earlier ones, so the last one wins
                $groups{$grp}{$name} = $val;
            }
            elsif ($name eq "CNT") {
                $groups{$grp}{$name} += $val;
            }
            else {
                # hash keys act as the set of distinct values
                $groups{$grp}{$name}{$val}++;
            }
        }
    }

    #use Data::Dumper;     # debug
    #print Dumper \%groups;

    for my $grp (sort keys %groups) {
        my $fields = $groups{$grp};
        print $grp;

        for my $name (qw(B C D)) {
            my @elems = keys %{ $fields->{$name} || {} };
            next unless @elems;    # field never appeared in this group
            my $val = @elems > 1 ? "MULTIPLE" : $elems[0];
            print " $name:$val";
        }

        for my $name (qw(CNT STR END)) {
            next unless defined $fields->{$name};
            print " $name:$fields->{$name}";
        }

        print "\n";
    }

    __DATA__
    A:1 B:2 C:3 D:4 CNT:1 STR:1 END:2
    A:1 B:7 C:3 D:4 CNT:1 STR:2 END:3
    A:1 B:2 C:3 D:4 CNT:1 STR:3 END:4
    A:2 B:2 C:3 D:5 CNT:1 STR:4 END:5
    A:2 B:2 C:3 D:5 CNT:5 STR:5 END:10
    A:3 B:2 C:3 D:4 CNT:1 STR:11 END:12

    Output:

    A:1 B:MULTIPLE C:3 D:4 CNT:3 STR:1 END:4
    A:2 B:2 C:3 D:5 CNT:6 STR:4 END:10
    A:3 B:2 C:3 D:4 CNT:1 STR:11 END:12

    (STR and END are assumed to be sorted on input — in case they aren't, you'd need to compute the minimum and maximum value instead of taking the first and the last...)

Re^3: working with files
by choroba (Cardinal) on Apr 12, 2010 at 17:09 UTC
    Something like this?
    #!/usr/bin/perl
    #
    # Summarise records of the form
    #     A:n B:n C:n D:n CNT:n STR:n END:n
    # read from the files named on the command line (or STDIN when none are
    # given), grouping on the (A, D) pair.  For each group:
    #   B, C - the common value, or MULTIPLE if the records disagree
    #   CNT  - sum over the group
    #   STR  - smallest value in the group
    #   END  - largest value in the group
    # Lines that do not match the record pattern are ignored.  Groups are
    # printed ordered numerically by A, then by D.
    use warnings;
    use strict;

    # One "NAME:digits" capture per field, separated by one or more spaces.
    my $record_re = do {
        my $pattern = join ' +',
            map { $_ . ':([0-9]+)' } qw(A B C D CNT STR END);
        qr/$pattern/;
    };

    my %hash;

    while (defined(my $line = <>)) {
        next unless $line =~ $record_re;

        # Copy the captures right away; $a/$b are avoided as names because
        # they are reserved for sort comparators.
        my ($grp_a, $val_b, $val_c, $grp_d, $cnt, $str, $end)
            = ($1, $2, $3, $4, $5, $6, $7);
        my $key = "a:$grp_a,d:$grp_d";

        if (exists $hash{$key}) {
            my $current = $hash{$key};

            # B and C will be reported as MULTIPLE once any record differs
            $current->{b} = 'MULTIPLE' if $current->{b} ne $val_b;
            $current->{c} = 'MULTIPLE' if $current->{c} ne $val_c;

            # STR is start, we need the least
            $current->{str} = $str if $str < $current->{str};

            # END is end, we need the greatest
            $current->{end} = $end if $end > $current->{end};

            # add to count
            $current->{cnt} += $cnt;
        }
        else {
            # new key; a and d are stored so the key never has to be re-parsed
            $hash{$key} = {
                a   => $grp_a,
                b   => $val_b,
                c   => $val_c,
                d   => $grp_d,
                cnt => $cnt,
                str => $str,
                end => $end,
            };
        }
    }

    # print the results in a deterministic order
    my @ordered_keys = sort {
               $hash{$a}{a} <=> $hash{$b}{a}
            || $hash{$a}{d} <=> $hash{$b}{d}
    } keys %hash;

    for my $key (@ordered_keys) {
        my $current = $hash{$key};
        print join(' ',
            "A:$current->{a}",
            "B:$current->{b}",
            "C:$current->{c}",
            "D:$current->{d}",
            "CNT:$current->{cnt}",
            "STR:$current->{str}",
            "END:$current->{end}",
        ), "\n";
    }