# llil2.pl
# Example run:  perl llil2.pl tt1.txt tt2.txt tt3.txt >out.txt

use strict;
use warnings;

# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name a TAB character and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
#   camel\t42
#   pearl\t94
#   dromedary\t69
# and tt2.txt containing:
#   camel\t8
#   hello\t12345
#   dromedary\t1
# returns this hashref:
#   $hash_ret{"camel"}     = 50
#   $hash_ret{"dromedary"} = 70
#   $hash_ret{"hello"}     = 12345
#   $hash_ret{"pearl"}     = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
#   hello\t12345
#   pearl\t94
#   dromedary\t70
#   camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------

# Function get_properties
# Read a list of LLiL-format files
# Return a reference to a hash of properties
# Dies if any file cannot be opened or closed.
sub get_properties
{
   my $files = shift;   # in: reference to a list of LLiL-format files
   my %hash_ret;        # out: reference to a hash of properties

   for my $fname ( @{$files} ) {
      open( my $fh, '<', $fname ) or die "error: open '$fname': $!";
      while (<$fh>) {
         chomp;
         # Each line is "name<TAB>count"; counts for a repeated name are summed
         my ($word, $count) = split /\t/;
         $hash_ret{$word} += $count;
      }
      close($fh) or die "error: close '$fname': $!";
   }
   return \%hash_ret;
}

# ----------------- mainline -------------------------------------------

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;

warn "llil2 start\n";
my $tstart1 = time;
my $href = get_properties( \@llil_files );
my $tend1 = time;
my $taken1 = $tend1 - $tstart1;
warn "get_properties : $taken1 secs\n";

my $tstart2 = time;
# Using two sorts is waaay faster than one in Perl for some reason! (see [id://11148545])
# The inner sort orders the keys by name; the outer sort then orders by
# descending count. For equal counts to stay in name order, the outer sort
# must keep equal elements in their incoming order (i.e. behave stably).
for my $key ( sort { $href->{$b} <=> $href->{$a} } sort keys %{$href} ) {
   print "$key\t$href->{$key}\n";
}
my $tend2 = time;
my $taken2 = $tend2 - $tstart2;
my $taken = $tend2 - $tstart1;
warn "sort + output : $taken2 secs\n";
warn "total : $taken secs\n";

##

##

// llil2.cpp. C++ 11 version of Perl llil.pl.
// llil2.cpp is faster than llil.cpp while also clarifying limits:
//   - all keys should be less than 200 or so characters in length
//   - numbers are 64 bit integers (max: 9,223,372,036,854,775,807)
// g++ compile on Linux:
//    g++ -o llil2 -std=c++11 -Wall -O3 llil2.cpp
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run: llil2 tt1.txt tt2.txt tt3.txt >out.txt

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>

#include <string>
#include <vector>
#include <map>
#include <unordered_map>

#include <algorithm>
#include <utility>

#include <iostream>
#include <fstream>
#include <sstream>

// Compile-time guard: refuse to build when size_t is not the same width as
// int64_t (for example a 32-bit compile), since this program expects a 64-bit build.
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// ----------------------------------------------------------------------------

// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
//    #include <chrono>
//    #include <thread>

// For some performance hacks to speed up C++ I/O see:
//   https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/
// The only one we use here is to prefer "\n" to std::endl to reduce stdout flushing

// ----------------------------------------------------------------------------

// Count type, plus the containers built on it:
//   str_int_type     : one (name, count) record
//   map_str_int_type : name -> accumulated count
//   vec_str_int_type : flat list of records, used for sorting
using llil_int_type    = long long;
using str_int_type     = std::pair<std::string, llil_int_type>;
using map_str_int_type = std::map<std::string, llil_int_type>;
using vec_str_int_type = std::vector<str_int_type>;

// Mimic the Perl get_properties subroutine ----------------------------

// Limit line length and use lower level ANSI C functions to try to boost I/O performance
// TODO (maybe):
//   - reading: Try ::setvbuf(fh, NULL, _IOFBF, 65536) or some such on input files
//   - writing: Try ::setvbuf(stdout, stdout_buf, _IOFBF, sizeof(stdout_buf)) on stdout
//              ... or instead of writing to stdout, take an output file as a program argument
#define MAX_LINE_LEN_L 255

// Read each LLiL-format file ("name<TAB>count" per line) and add every count
// to hash_ret[name]; counts for a name seen more than once are summed.
//
// Lines that cannot be parsed are skipped instead of being handed to
// atoll()/std::string as a null pointer:
//   - a line with no TAB separator (this includes empty lines),
//   - a line whose count field is empty,
//   - a line longer than MAX_LINE_LEN_L characters (the whole line is dropped).
// The number of skipped lines, if any, is reported once per file on stderr.
//
// If a file cannot be opened, an error is written to stderr and the function
// returns immediately; entries merged from earlier files stay in hash_ret.
static void get_properties(
   int                nfiles,      //  in: the number of input files
   char*              fname[],     //  in: the input file names
   map_str_int_type&  hash_ret)    // out: a hash of properties
{
   char line[MAX_LINE_LEN_L+1];
   for (int i = 0; i < nfiles; ++i) {
      FILE* fh = ::fopen(fname[i], "r");
      if (fh == NULL) {
         std::cerr << "Error opening '" << fname[i] << "'\n";
         return;
      }
      unsigned long nskipped = 0;
      while ( ::fgets(line, sizeof(line), fh) != NULL ) {
         // No newline in the buffer: either the line exactly filled the buffer,
         // the file ends without a newline, or the line is too long.
         if (::strchr(line, '\n') == NULL) {
            int c = ::fgetc(fh);
            if (c != '\n' && c != EOF) {
               // Too long: discard the rest so the tail is not parsed as a record
               while (c != '\n' && c != EOF) c = ::fgetc(fh);
               ++nskipped;
               continue;
            }
         }
         // Leading TABs are ignored, as strtok() did in the earlier implementation
         char* word = line + ::strspn(line, "\t");
         char* tab  = ::strchr(word, '\t');
         if (tab == NULL || tab[1] == '\0' || tab[1] == '\n') {
            ++nskipped;   // no separator, or nothing after it
            continue;
         }
         *tab = '\0';     // terminate the name in place
         hash_ret[word] += ::atoll(tab + 1);
      }
      ::fclose(fh);
      if (nskipped != 0) {
         std::cerr << "Warning: skipped " << nskipped
                   << " malformed line(s) in '" << fname[i] << "'\n";
      }
   }
}

// ---------------------------------------------------------------------

// Program entry point.
// Merges the LLiL files named on the command line, writes the merged
// "name<TAB>count" lines to stdout sorted by descending count (ties broken by
// ascending name), and reports phase timings in whole seconds on stderr.
// Returns 1 with a usage message when no file is given, otherwise 0.
int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil2 file1 file2 ... >out.txt\n";
      return 1;
   }

   std::cerr << "llil2 start\n";
   time_t tstart1 = ::time(NULL);

   // Create the hash of properties
   map_str_int_type hash_ret;
   get_properties(argc - 1, &argv[1], hash_ret);
   time_t tend1 = ::time(NULL);
   // difftime() yields seconds as a double; +0.5 rounds to the nearest whole second
   long taken1  = static_cast<long>(::difftime(tend1, tstart1) + 0.5);
   std::cerr << "get_properties : " << taken1 << " secs\n";

   // Sort descending by value, i.e. mimic this Perl code in C++:
   //   sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}
   time_t tstart2 = ::time(NULL);
   vec_str_int_type v( hash_ret.begin(), hash_ret.end() );
   std::sort( v.begin(), v.end(),
      [](const str_int_type& left, const str_int_type& right) {
         // Larger count first; equal counts fall back to ascending name
         return right.second != left.second ? right.second < left.second
                                            : left.first < right.first;
      }
   );

   // Output the merged properties
   for ( auto const& n : v ) { std::cout << n.first << '\t' << n.second << '\n'; }

   time_t tend2 = ::time(NULL);
   long taken2  = static_cast<long>(::difftime(tend2, tstart2) + 0.5);
   long taken   = static_cast<long>(::difftime(tend2, tstart1) + 0.5);
   std::cerr << "sort + output  : " << taken2 << " secs\n";
   std::cerr << "total          : " << taken << " secs\n";

   // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
   //   std::this_thread::sleep_for(std::chrono::milliseconds(90000000));

   return 0;
}

##

##

get_properties : 11 secs
sort + output  : 74 secs
total          : 85 secs

##

##

get_properties : 11 secs
sort + output  : 25 secs
total          : 36 secs

##

##

get_properties : 10 secs
sort + output  : 20 secs
total          : 30 secs

##

##

get_properties : 9 secs
sort + output  : 7 secs
total          : 16 secs

##

##

get_properties : 6 secs
sort + output  : 6 secs
total          : 12 secs

##

##

using map_str_int_type = std::unordered_map<std::string, llil_int_type>;
// to (llil2a.cpp):
using map_str_int_type = std::map<std::string, llil_int_type>;

##

##

get_properties : 4 secs
sort + output  : 5 secs
total          : 9 secs

##

##

sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href}

##

##

sort { $href->{$b} <=> $href->{$a} } sort keys %{$href} )