in reply to Re^8: Best way to store/access large dataset?
in thread Best way to store/access large dataset?
If the result you want is along these lines
Attribute : 1
Category : Circle Rectangle Square Triangle
Sum : 4 4 4 0
Count : 4 4 4 4
Percent : 100.00% 100.00% 100.00% 0.00%

Attribute : 2
Category : Circle Rectangle Square Triangle
Sum : 4 0 4 0
Count : 4 4 4 4
Percent : 100.00% 0.00% 100.00% 0.00%

Attribute : 3
Category : Circle Rectangle Square Triangle
Sum : 0 4 0 4
Count : 4 4 4 4
Percent : 0.00% 100.00% 0.00% 100.00%
then try this
#!/usr/bin/perl
# poj -- NOTE(review): this text was fused onto the front of the shebang in
#        the source; presumably the poster's signature. Kept so nothing is lost.
#
# Summarise attribute scores by category.
#
# Inputs (read from the current directory):
#   Attributes_ID.txt  lines of "<file id> <category>"; lines that do not
#                      start with a digit are skipped.
#   Attributes.txt     a header line giving the file id of each score column
#                      (the first header field belongs to the attribute
#                      column and is ignored), followed by one line per
#                      attribute: "<attribute> <score> <score> ...".
#                      Lines that do not start with a digit are skipped.
#
# Outputs:
#   report.txt         for every attribute: the per-category sum of scores,
#                      the number of columns in each category, and
#                      100 * sum / count formatted as a percentage.
#   STDOUT             number of lookup records loaded, a progress message
#                      every $PAGESIZE records, and the final record count
#                      with the elapsed time in seconds.
#
# Dies if a file cannot be opened, if Attributes.txt is empty, or if the
# report cannot be written or closed.
use strict;
use warnings;

my $t0 = time;    # start of run, used for the elapsed-time message

my $PAGESIZE    = 100_000;               # progress message interval (records)
my $UNKNOWN     = '(unknown)';           # bucket for columns with no category
my $fileID      = 'Attributes_ID.txt';
my $fileA       = 'Attributes.txt';
my $file_report = 'report.txt';

# ---------------------------------------------------------------------------
# Load the file-id => category lookup.
# ---------------------------------------------------------------------------
my %id2categ;
my $loaded = 0;

open my $id_fh, '<', $fileID or die "Cannot open $fileID: $!";
while ( my $line = <$id_fh> ) {
    chomp $line;
    next unless $line =~ /^\d/;    # skip junk

    # e.g. "1.file.ext Square"
    my ( $id, $cat ) = split /\s+/, $line;
    $id2categ{$id} = $cat;
    ++$loaded;
}
close $id_fh;
print "$fileID : $loaded records loaded\n";

# ---------------------------------------------------------------------------
# Read the header of the score file and map every score column to a category.
# ---------------------------------------------------------------------------
open my $in_fh, '<', $fileA or die "Cannot open $fileA: $!";

my $header = <$in_fh>;
die "$fileA is empty\n" unless defined $header;
chomp $header;

# split on /\s+/ (not ' ') on purpose: a header that starts with whitespace
# then yields a leading empty field, which stands for the attribute column
# and is discarded together with any label found there.
my ( undef, @file_ids ) = split /\s+/, $header;

my @col2categ;
for my $id (@file_ids) {
    my $categ = $id2categ{$id};
    unless ( defined $categ ) {

        # Report the column instead of silently folding it into an
        # undef/"" hash key and emitting uninitialized-value warnings.
        warn "No category for file id '$id' in $fileID; "
            . "reporting it under $UNKNOWN\n";
        $categ = $UNKNOWN;
    }
    push @col2categ, $categ;
}

# Number of columns per category. This never changes, so it is computed once
# and kept numeric (it is the divisor of the percentage).
my %cols_in;
$cols_in{$_}++ for @col2categ;
my @category = sort keys %cols_in;

# The Category and Count lines are identical for every attribute.
# NOTE(review): the labels are reproduced exactly as they appear in the
# source, where they have different widths, so the columns of the four lines
# do not line up under each other; whitespace may have been collapsed when
# the post was extracted -- pad the labels if alignment matters.
my $category_line = join '', 'Category : ',
    map { sprintf '%10s', $_ } @category;
my $count_line = join '', 'Count : ',
    map { sprintf '%10d', $cols_in{$_} } @category;

# ---------------------------------------------------------------------------
# Process each attribute line in turn.
# ---------------------------------------------------------------------------
open my $out_fh, '>', $file_report or die "Cannot open $file_report: $!";

my $total = 0;
while ( my $line = <$in_fh> ) {
    chomp $line;
    next unless $line =~ /^\d/;    # skip junk

    my ( $attr, @score ) = split /\s+/, $line;

    # Scores beyond the last header column have no category to go to.
    if ( @score > @col2categ ) {
        warn "Attribute $attr: ", scalar(@score), ' scores but only ',
            scalar(@col2categ), " header columns; extra scores ignored\n";
        $#score = $#col2categ;
    }

    # Aggregate by category. Every category starts at 0 so that a short row
    # still formats cleanly.
    my %sum = map { $_ => 0 } @category;
    for my $col ( 0 .. $#score ) {
        $sum{ $col2categ[$col] } += $score[$col];
    }

    my $sum_line = join '', 'Sum : ',
        map { sprintf '%10d', $sum{$_} } @category;

    # Every key of %cols_in was counted at least once, so the divisor is
    # never zero.
    my $pcent_line = join '', 'Percent : ',
        map { sprintf '%9.2f%%', 100 * $sum{$_} / $cols_in{$_} } @category;

    print {$out_fh} "\nAttribute : $attr\n",
        "$category_line\n",
        "$sum_line\n",
        "$count_line\n",
        "$pcent_line\n"
        or die "Cannot write to $file_report: $!";

    # progress monitor
    ++$total;
    print "Processed $total records\n" if $total % $PAGESIZE == 0;
}
close $in_fh;

# Buffered write errors only surface at close, so check it.
close $out_fh or die "Cannot close $file_report: $!";

my $dur = time - $t0;
printf "%s records time = %s seconds\n", $total, $dur;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^10: Best way to store/access large dataset?
by Speed_Freak (Sexton) on Jun 28, 2018 at 19:53 UTC | |
|
Re^10: Best way to store/access large dataset?
by Speed_Freak (Sexton) on Aug 02, 2018 at 14:57 UTC |