# llil2grt.pl. Try a GRT version.
# Example run: perl llil2grt.pl tt1.txt tt2.txt tt3.txt >out.txt
use strict;
use warnings;
use feature qw{say};
# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name, a TAB character, and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
# camel\t42
# pearl\t94
# dromedary\t69
# and tt2.txt containing:
# camel\t8
# hello\t12345
# dromedary\t1
# returns this hashref:
# $hash_ret{"camel"} = 50
# $hash_ret{"dromedary"} = 70
# $hash_ret{"hello"} = 12345
# $hash_ret{"pearl"} = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
# hello\t12345
# pearl\t94
# dromedary\t70
# camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------
# Function get_properties
# Read a list of LLiL-format files and total the counts per name.
#   in : reference to a list of LLiL-format file names
#   out: reference to a hash mapping each name to the sum of its counts
#        across all lines of all files
# Dies if a file cannot be opened or closed.
sub get_properties
{
    my $files = shift;    # in: reference to a list of LLiL-format files
    my %hash_ret;         # out: name => summed count (returned by reference)
    for my $fname ( @{$files} ) {
        open( my $fh, '<', $fname ) or die "error: open '$fname': $!";
        # Read into a lexical rather than the global $_ so that a caller
        # who has $_ aliased (e.g. inside for/map) is not clobbered.
        while ( my $line = <$fh> ) {
            chomp $line;
            my ($word, $count) = split /\t/, $line;
            $hash_ret{$word} += $count;    # counts for the same name are added
        }
        close($fh) or die "error: close '$fname': $!";
    }
    return \%hash_ret;
}
# ----------------- mainline -------------------------------------------
@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;
warn "llil2grt start\n";

# Phase 1: read every input file and total the counts per name.
my $tstart1 = time;
my $href = get_properties( \@llil_files );
my $tend1 = time;
my $taken1 = $tend1 - $tstart1;
warn "get_properties : $taken1 secs\n";

# Phase 2: GRT sort. Each record is prefixed with a 4-byte big-endian key so
# that a plain string sort yields: descending by count, then ascending by name.
# The key is (0xFFFFFFFF - count), so larger counts get smaller keys and a
# count of 0 gets the largest key (sorts last). Negating the count instead
# would map 0 to key 0 and wrongly place zero-count names first.
# NOTE: the key is 32 bits wide; counts above 4294967295 do not fit and
# would sort incorrectly.
my $tstart2 = time;
my @lines;
while (my ($k, $v) = each %{$href}) {
    push @lines, pack('NA*', 0xFFFFFFFF - $v, "$k\t$v");
}
# Strip the 4-byte sort key before printing.
say substr($_, 4) for sort @lines;
my $tend2 = time;
my $taken2 = $tend2 - $tstart2;
my $taken = $tend2 - $tstart1;
warn "sort + output : $taken2 secs\n";
warn "total : $taken secs\n";
####
> perl llil2grt.pl big1.txt big2.txt big3.txt >perl2grt.tmp
####
llil2grt start
get_properties : 10 secs
sort + output : 20 secs
total : 30 secs
Memory use (Windows Private Bytes): 2,657,968K
####
> diff perl2d.tmp perl2grt.tmp
####
# Build one dualvar per entry: numeric value is the count, string value is
# the name, so a single scalar can be compared both ways.
my @data = map { dualvar($href->{$_}, $_) } keys %{$href};

# Order by count descending (numeric side), breaking ties by name ascending
# (string side), then print "name<TAB>count".
for my $item ( sort { $b <=> $a or $a cmp $b } @data ) {
    say "$item\t" . (0 + $item);
}
####
# GRT sort: prefix each "name<TAB>count" record with a 4-byte big-endian key
# of (0xFFFFFFFF - count) so a plain string sort gives descending count, then
# ascending name. Using the negated count as the key would map a count of 0
# to key 0 and sort zero-count names first instead of last.
# NOTE: the key is 32 bits wide; counts above 4294967295 would mis-sort.
my @lines;
while (my ($k, $v) = each %{$href}) {
    push @lines, pack('NA*', 0xFFFFFFFF - $v, "$k\t$v");
}
# Drop the 4-byte key before printing.
say substr($_, 4) for sort @lines;