# llil.pl # Example run: perl llil.pl tt1.txt tt2.txt >oo1.tmp use strict; use warnings; # ---------------------------------------------------------------------- # LLiL specification # ------------------ # A LLiL-format file is a text file. # Each line consists of a lowercase name a TAB character and a non-negative integer count. # That is, each line must match : ^[a-z]+\t\d+$ # For example, reading the LLiL-format files, tt1.txt containing: # camel\t42 # pearl\t94 # dromedary\t69 # and tt2.txt containing: # camel\t8 # hello\t12345 # dromedary\t1 # returns this hashref: # $hash_ret{"camel"} = 50 # $hash_ret{"dromedary"} = 70 # $hash_ret{"hello"} = 12345 # $hash_ret{"pearl"} = 94 # That is, values are added for items with the same key. # # To get the required LLiL text, you must sort the returned hashref # descending by value and insert a TAB separator: # hello\t12345 # pearl\t94 # dromedary\t70 # camel\t50 # To make testing via diff easier, we further sort ascending by name # for lines with the same value. # ---------------------------------------------------------------------- # Function get_properties # Read a list of LLiL-format files # Return a reference to a hash of properties sub get_properties { my $files = shift; # in: reference to a list of LLiL-format files my %hash_ret; # out: reference to a hash of properties for my $fname ( @{$files} ) { open( my $fh, '<', $fname ) or die "error: open '$fname': $!"; while (<$fh>) { chomp; my ($word, $count) = split /\t/; $hash_ret{$word} += $count; } close($fh) or die "error: close '$fname': $!"; } return \%hash_ret; } # ----------------- mainline ------------------------------------------- @ARGV or die "usage: $0 file...\n"; my @llil_files = @ARGV; warn "llil start\n"; my $tstart1 = time; my $href = get_properties( \@llil_files ); my $tend1 = time; my $taken1 = $tend1 - $tstart1; warn "get_properties : $taken1 secs\n"; my $tstart2 = time; for my $key ( sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} ) { print "$key\t$href->{$key}\n"; } my $tend2 = time; my $taken2 = $tend2 - $tstart2; my $taken = $tend2 - $tstart1; warn "sort + output : $taken2 secs\n"; warn "total : $taken secs\n"; ##

##

sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}

##

##

// llil.cpp. C++ 11 version of Perl llil.pl.
// g++ compile on Linux:
//   g++ -o llil -std=c++11 -Wall -O3 llil.cpp
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run: llil tt1.txt tt2.txt >out.txt

// Uncomment next line to sort by creating a multimap (instead of via the sort function)
// #define LLIL_SORT_VIA_MULTIMAP_L 1

#include 
#include 
#include 
#include 
#include 

#include 
#include 
#include 
#include 
#include 

#include 
#include 

#include 
#include 
#include 

// ------------------------------------------------------------

// Performance hacks to speed up IO.
// See https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/
// Avoid flush by using "\n" instead of std::endl (this made a big difference in this program!)
// This one made almost no difference in this program:
//   const auto io_speed_up =[](){
//      std::ios::sync_with_stdio(false);
//      std::cin.tie(nullptr);
//      return nullptr;
//   }();

// ------------------------------------------------------------

using str_int_type     = std::pair;
using map_str_int_type = std::unordered_map;
using vec_str_int_type = std::vector;

// Mimic the Perl get_properties subroutine
static void get_properties(
   int                nfiles,     //  in: the number of input files
   char*              fname[],    //  in: the input file names
   map_str_int_type&  hash_ret)   // out: a hash of properties
{
   for (int i = 0; i < nfiles; ++i) {
      std::ifstream llfile(fname[i]);
      if (!llfile) {
         std::cerr << "Error opening '" << fname[i] << "'\n";
         return;
      }
      for (std::string line; std::getline(llfile, line); ) {
         std::string word, count;
         std::stringstream ssline(line);
         std::getline(ssline, word, '\t');
         std::getline(ssline, count);
         hash_ret[word] += std::stoi(count);
      }
   }
}

#ifdef LLIL_SORT_VIA_MULTIMAP_L

// Convert a std::unordered_map to a std::multimap
static std::multimap invert_map(const std::unordered_map& m)
{
   std::multimap mm;
   for (std::unordered_map::const_iterator it = m.begin(); it != m.end(); ++it) {
      mm.insert( std::make_pair(it->second, it->first) );
   }
   return mm;
}

#endif

int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil file1 file2 ... >out.txt\n";
      return 1;
   }

#ifdef LLIL_SORT_VIA_MULTIMAP_L
   std::cerr << "llil start (multimap version)\n";
#else
   std::cerr << "llil start (sort version)\n";
#endif
   time_t tstart1 = ::time(NULL);

   // Create the hash of properties
   map_str_int_type hash_ret;
   get_properties(argc - 1, &argv[1], hash_ret);
   time_t tend1 = ::time(NULL);
   long taken1  = static_cast(::difftime(tend1, tstart1) + 0.5);
   std::cerr << "get_properties : " << taken1 << " secs\n";

   // Sort descending by value, i.e. mimic this Perl code in C++:
   //   sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}

   time_t tstart2 = ::time(NULL);

#ifdef LLIL_SORT_VIA_MULTIMAP_L

   std::multimap newmap = invert_map(hash_ret);
   for (std::multimap::reverse_iterator it = newmap.rbegin(); it != newmap.rend(); ++it) {
      std::cout << it->second << '\t' << it->first << "\n"; 
   }

#else

   vec_str_int_type v( hash_ret.begin(), hash_ret.end() );
   std::sort( v.begin(), v.end(),
      [](const str_int_type& left, const str_int_type& right) { return right.second != left.second ? right.second < left.second : left.first < right.first; }
   );
   for ( const std::pair& n : v ) {
       std::cout << n.first << '\t' << n.second << "\n";
   }

#endif

   time_t tend2 = ::time(NULL);
   long taken2  = static_cast(::difftime(tend2, tstart2) + 0.5);
   long taken   = static_cast(::difftime(tend2, tstart1) + 0.5);
   std::cerr << "sort + output  : " << taken2 << " secs\n";
   std::cerr << "total          : " << taken << " secs\n";

   return 0;
}

##

##

# gen-llil.pl
# Crude program to generate a big LLiL test file to use in benchmarks
# perl gen-llil.pl big2.txt 200 3 - produces a test file with size = 35,152,000 bytes

use strict;
use warnings;
use autodie;

{
   my $ordmin = ord('a');
   my $ordmax = ord('z') + 1;

   # Generate a random word
   sub gen_random_word {
      my $word  = shift;    # word prefix
      my $nchar = shift;    # the number of random chars to append
      for my $i (1 .. $nchar) {
         $word .= chr( $ordmin + int( rand($ordmax - $ordmin) ) );
      }
      return $word;
   }
}

sub create_test_file {
   my $fname   = shift;
   my $count   = shift;
   my $wordlen = shift;
   open( my $fh_out, '>', $fname );
   for my $c ( 'aaa' .. 'zzz' ) {
      for my $i (1 .. $count) {
         print {$fh_out} gen_random_word( $c, $wordlen ) . "\t" . 1 . "\n";
      }
   }
}

my $outfile = shift;
my $count   = shift;
my $wordlen = shift;
$outfile or die "usage: $0 outfile count wordlen\n";
$count or die "usage: $0 outfile count wordlen\n";
print "generating test file '$outfile' with count '$count'\n";
create_test_file($outfile, $count, $wordlen);
print "file size=", -s $outfile, "\n";

##

##

perl gen-llil.pl big1.txt 200 3
perl gen-llil.pl big2.txt 200 3
perl gen-llil.pl big3.txt 200 3

##

##

perl llil.pl big1.txt big2.txt big3.txt >perl.tmp

##

##

llil big1.txt big2.txt big3.txt >cpp.tmp

##

##

diff perl.tmp cpp.tmp

##

##

> perl llil.pl big1.txt big2.txt big3.txt >perl.tmp
llil start
get_properties : 11 secs
sort + output  : 74 secs
total          : 85 secs

##

##

> llil big1.txt big2.txt big3.txt >cpp.tmp
llil start (sort version)
get_properties : 9 secs
sort + output  : 7 secs
total          : 16 secs

##

##

> diff cpp.tmp perl.tmp