# llil.pl
# Example run: perl llil.pl tt1.txt tt2.txt >oo1.tmp
use strict;
use warnings;
# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name, a TAB character, and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
# camel\t42
# pearl\t94
# dromedary\t69
# and tt2.txt containing:
# camel\t8
# hello\t12345
# dromedary\t1
# returns this hashref:
# $hash_ret{"camel"} = 50
# $hash_ret{"dromedary"} = 70
# $hash_ret{"hello"} = 12345
# $hash_ret{"pearl"} = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
# hello\t12345
# pearl\t94
# dromedary\t70
# camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------
# Function get_properties
# Read a list of LLiL-format files and total the counts for each name.
#   in : $files - reference to a list of LLiL-format file names
#   out: reference to a hash mapping each name to the sum of its counts
# Dies if a file cannot be opened or closed.
# A line that is not "<lowercase name>\t<digits>" is reported with warn and
# skipped, so a blank or malformed line can never create a bogus hash key.
sub get_properties
{
    my $files = shift;    # in: reference to a list of LLiL-format files
    my %hash_ret;         # out: name => accumulated count
    for my $fname ( @{$files} ) {
        open( my $fh, '<', $fname ) or die "error: open '$fname': $!";
        while ( my $line = <$fh> ) {
            # One anchored match both validates the line and extracts the
            # two fields. The line terminator is optional so that a final
            # line without a newline is accepted; a CR before the newline
            # is tolerated for files written with CRLF line endings.
            if ( $line =~ /\A([a-z]+)\t([0-9]+)\r?\n?\z/ ) {
                $hash_ret{$1} += $2;
            }
            else {
                warn "warning: '$fname' line $.: skipping malformed line\n";
            }
        }
        close($fh) or die "error: close '$fname': $!";
    }
    return \%hash_ret;
}
# ----------------- mainline -------------------------------------------
# At least one input file must be named on the command line.
die "usage: $0 file...\n" unless @ARGV;
my @llil_files = @ARGV;
warn "llil start\n";

# Phase 1: read every input file and accumulate the per-name totals.
# Timing uses the built-in time, so durations are whole seconds.
my $read_began = time;
my $href       = get_properties( \@llil_files );
my $read_ended = time;
my $read_secs  = $read_ended - $read_began;
warn "get_properties : $read_secs secs\n";

# Phase 2: order by count descending, breaking ties by name ascending,
# then emit one "name<TAB>count" line per entry on standard output.
my $sort_began  = time;
my @ordered_keys =
    sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href};
for my $name (@ordered_keys) {
    print "$name\t$href->{$name}\n";
}
my $sort_ended = time;

# Report the phase 2 duration and the overall wall-clock time on stderr.
my $sort_secs  = $sort_ended - $sort_began;
my $total_secs = $sort_ended - $read_began;
warn "sort + output : $sort_secs secs\n";
warn "total : $total_secs secs\n";
####
sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}
##
##
// llil.cpp. C++ 11 version of Perl llil.pl.
// g++ compile on Linux:
// g++ -o llil -std=c++11 -Wall -O3 llil.cpp
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run: llil tt1.txt tt2.txt >out.txt
// Uncomment next line to sort by creating a multimap (instead of via the sort function)
// #define LLIL_SORT_VIA_MULTIMAP_L 1
#include
#include
#include
#include
#include
#include
#include
#include
#include
##
# gen-llil.pl
# Crude program to generate a big LLiL test file to use in benchmarks
# perl gen-llil.pl big2.txt 200 3 - produces a test file with size = 35,152,000 bytes
use strict;
use warnings;
use autodie;
{
    # Private to gen_random_word: code point of the first letter and the
    # number of letters in the range 'a' .. 'z'.
    my $first_ord     = ord('a');
    my $alphabet_size = ord('z') + 1 - $first_ord;

    # Function gen_random_word
    # Append random lowercase letters to a prefix.
    #   $word  - the word prefix
    #   $nchar - the number of random chars to append
    # Returns the prefix followed by $nchar letters drawn with rand.
    sub gen_random_word {
        my ( $word, $nchar ) = @_;
        $word .= chr( $first_ord + int( rand($alphabet_size) ) )
            for 1 .. $nchar;
        return $word;
    }
}
# Function create_test_file
# Write a LLiL-format test file. For every three-letter prefix from 'aaa'
# to 'zzz' it emits $count lines, each made of the prefix plus a random
# suffix, a TAB, and the count 1.
#   $fname   - path of the file to create (opened with '>', so truncated)
#   $count   - number of lines to write per prefix
#   $wordlen - number of random chars appended to each prefix
# open and close failures are fatal through the file-level "use autodie".
sub create_test_file {
    my $fname   = shift;
    my $count   = shift;
    my $wordlen = shift;
    open( my $fh_out, '>', $fname );
    for my $c ( 'aaa' .. 'zzz' ) {
        for my $i ( 1 .. $count ) {
            print {$fh_out} gen_random_word( $c, $wordlen ) . "\t" . 1 . "\n";
        }
    }

    # Close explicitly instead of letting the handle fall out of scope:
    # buffered write errors (for example a full disk) only surface when the
    # buffer is flushed, and autodie can only report them on an explicit
    # close call.
    close($fh_out);
    return;
}
# ----------------- mainline -------------------------------------------
# Command line: outfile count wordlen (all three are required).
my $outfile = shift;
my $count   = shift;
my $wordlen = shift;
my $usage   = "usage: $0 outfile count wordlen\n";
$outfile or die $usage;
$count   or die $usage;

# wordlen is tested with defined rather than truth so that 0 (no random
# suffix) stays a valid request. Without this check a missing argument
# reached the generator as undef, warning on every generated line and
# silently writing words with no random suffix.
defined $wordlen or die $usage;
print "generating test file '$outfile' with count '$count'\n";
create_test_file( $outfile, $count, $wordlen );
print "file size=", -s $outfile, "\n";
##
##
perl gen-llil.pl big1.txt 200 3
perl gen-llil.pl big2.txt 200 3
perl gen-llil.pl big3.txt 200 3
##
##
perl llil.pl big1.txt big2.txt big3.txt >perl.tmp
##
##
llil big1.txt big2.txt big3.txt >cpp.tmp
##
##
diff perl.tmp cpp.tmp
##
##
> perl llil.pl big1.txt big2.txt big3.txt >perl.tmp
llil start
get_properties : 11 secs
sort + output : 74 secs
total : 85 secs
##
##
> llil big1.txt big2.txt big3.txt >cpp.tmp
llil start (sort version)
get_properties : 9 secs
sort + output : 7 secs
total : 16 secs
##
##
> diff cpp.tmp perl.tmp