comment on

If the result you want is along these lines:

Attribute : 1
Category :     Circle Rectangle    Square  Triangle         
Sum      :          4         4         4         0
Count    :          4         4         4         4
Percent  :    100.00%   100.00%   100.00%     0.00%

Attribute : 2
Category :     Circle Rectangle    Square  Triangle         
Sum      :          4         0         4         0
Count    :          4         4         4         4
Percent  :    100.00%     0.00%   100.00%     0.00%

Attribute : 3
Category :     Circle Rectangle    Square  Triangle         
Sum      :          0         4         0         4
Count    :          4         4         4         4
Percent  :      0.00%   100.00%     0.00%   100.00%

then try this:

#!/usr/bin/perl
use strict;
use warnings;
#use Data::Dump 'pp';

# Summarise a wide attribute/score table by category.
#
# Inputs (read from the current directory):
#   Attributes_ID.txt - "<file id> <category>" per line; only lines that
#                       start with a digit are used.
#   Attributes.txt    - a header line of file ids (its first field sits above
#                       the attribute column and is discarded), followed by
#                       one line per attribute: "<attribute> <score> ...".
#                       Again only lines that start with a digit are used.
# Outputs:
#   report.txt        - for every attribute the Sum, Count and Percent of the
#                       scores, grouped by category.
#   STDOUT            - load summary, progress lines, final total and the
#                       elapsed time in seconds.

my $t0 = time; # start

# load categ look up : file id => category
my $fileID = 'Attributes_ID.txt';
open my $id_fh, '<', $fileID or die "Cannot open $fileID: $!";
my %id2categ;
my $loaded = 0;
while (my $line = <$id_fh>) {
  chomp $line;
  next unless $line =~ /^\d/; # skip junk
  #1.file.ext    Square
  my ($id, $cat) = split /\s+/, $line;
  unless (defined $cat) {
    # an id without a category would later surface as an undefined hash key
    warn "$fileID line $.: no category for '$id', skipped\n";
    next;
  }
  $id2categ{$id} = $cat;
  ++$loaded;
}
close $id_fh;
print "$fileID : $loaded records loaded\n";
#pp \%id2categ;

# read header to get fileid for each column
my $fileA = 'Attributes.txt';
open my $attr_fh, '<', $fileA or die "Cannot open $fileA: $!";
my $line1 = <$attr_fh>;
die "$fileA : no header line\n" unless defined $line1;
chomp $line1;
my (undef, @fileid) = split /\s+/, $line1;

# convert fileid to category; index N here matches score N of a data row
my @col2categ = map { $id2categ{$_} } @fileid;
for my $col (grep { !defined $col2categ[$_] } 0 .. $#fileid) {
  warn "$fileA : column '$fileid[$col]' has no category, ignored\n";
}
#pp \@col2categ;

# count no of cols for each categ once
my %ncols;
$ncols{$_}++ for grep { defined } @col2categ;
my @category = sort keys %ncols;
#pp \%ncols;

# The category and count lines are the same for every attribute, so format
# them once.  The parentheses keep the trailing "\n" out of map's input list
# (otherwise it gets padded by %10s as well).
my $categ_line = join '', 'Category : ',
  (map { sprintf '%10s', $_ } @category), "\n";
my $count_line = join '', 'Count    : ',
  (map { sprintf '%10d', $ncols{$_} } @category), "\n";

# process each attribute in turn
my $PAGESIZE = 100_000; # show progress every PAGESIZE records
my $fileR = 'report.txt';
open my $out_fh, '>', $fileR or die "Cannot open $fileR: $!";
my $total = 0;
while (my $line = <$attr_fh>) {
  chomp $line;
  next unless $line =~ /^\d/; # skip junk

  my ($attr, @score) = split /\s+/, $line;

  # aggregate by category; start every category at 0 so short rows still
  # print a number instead of triggering "uninitialized" warnings
  my %sum = map { $_ => 0 } @category;
  for my $col (0 .. $#score) {
    my $categ = $col2categ[$col];
    next unless defined $categ; # unmapped column or surplus score
    $sum{$categ} += $score[$col];
  }
  #pp \%sum;

  # output; every key of %ncols was counted at least once, so the
  # percentage never divides by zero
  print {$out_fh} "\nAttribute : $attr\n",
    $categ_line,
    'Sum      : ', (map { sprintf '%10d', $sum{$_} } @category), "\n",
    $count_line,
    'Percent  : ',
    (map { sprintf '%9.2f%%', 100 * $sum{$_} / $ncols{$_} } @category), "\n";

  # progress monitor
  ++$total;
  print "Processed $total records\n" if $total % $PAGESIZE == 0;
}
close $attr_fh;

# buffered write errors (e.g. disk full) only show up when the handle closes
close $out_fh or die "Cannot close $fileR: $!";

my $dur = time - $t0;
printf "%s records  time = %s seconds\n", $total, $dur;
[download]

poj

In reply to Re^9: Best way to store/access large dataset? by poj
in thread Best way to store/access large dataset? by Speed_Freak

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.