# create-test-files.pl
# Create the LLiL-format test files llil-1.txt and llil-2.txt in the
# current directory.

use strict;
use warnings;

# Write $data to $file, replacing any existing content.
# Dies with the file name and the OS error if the file cannot be opened,
# written or closed.
sub build_file {
   my ( $file, $data ) = @_;
   open( my $fh, '>', $file ) or die "open '$file': $!";
   print {$fh} $data or die "write '$file': $!";
   # Output is buffered, so a failed write (a full disk, for example) may
   # only be reported when the handle is closed.
   close($fh) or die "close '$file': $!";
}

# One "word<TAB>count" pair per line; the double-quoted heredoc turns
# each \t into a real tab character.
my $tt_1_data = <<"LLiL";
camel\t50
dromedary\t70
pearl\t42
LLiL

my $tt_2_data = <<"LLiL";
dromedary\t3
kibitzer\t1000
dromedary\t2
camel\t19
dromedary\t1
LLiL

my %test_files = (
   'llil-1.txt' => $tt_1_data,
   'llil-2.txt' => $tt_2_data
);

for my $fname (sort keys %test_files) {
   print STDERR "Create test file '$fname'...";
   unlink($fname);    # start from a clean slate; a missing file is not an error
   build_file( $fname, $test_files{$fname} );
   print STDERR "done.\n";
}

##

##

package LLiL;
use strict;
use warnings;

# Read a LLiL-format file (one "word<TAB>count" pair per line).
# Each count is added to $hash_ret->{word}, so repeated words are summed,
# both within one file and across calls that share the same hash.
# Return the number of lines in the file or -1 if the file could not be opened
# (in which case $hash_ret is not modified).
# For speed the lines are not validated; the input is assumed to be well formed.
sub get_properties {
   my $fname    = shift;    # in:    a LLiL-format filename
   my $hash_ret = shift;    # inout: a reference to a hash of properties
   my $cnt = 0;
   open( my $fh, '<', $fname ) or return -1;
   # Read into a lexical, not the global $_: while (<$fh>) does not localize
   # $_, so it would overwrite the caller's $_ (and, through aliasing, the
   # caller's own data when this sub is called inside a for/map/grep).
   while ( my $line = <$fh> ) {
      ++$cnt;
      chomp $line;
      my ( $word, $count ) = split /\t/, $line;
      $hash_ret->{$word} += $count;
   }
   close($fh);
   return $cnt;
}

# Note: Some extra validation that could be done in get_properties() above
#       ( not done because, to allow the code to run as fast as possible,
#         get_properties assumes the input data adheres to the LLiL spec,
#         that is, each line matches ^[a-z]+\t\d+$ ):
#   s/^\s+//; s/\s+$//;          # remove leading and trailing whitespace
#   next unless length;          # ignore empty lines
#   $word =~ /^[a-z]+$/ or die "error: invalid word  '$_' (must contain [a-z] only)";
#   $count =~ /^\d+$/   or die "error: invalid count '$_' (must contain [0-9] only)";

1;

##

##

# llil.t
# Simple unit test of get_properties() function in LLiL.pm.

# Normal run of this test : prove -v -I . llil.t
# Can also run with       : perl -I . llil.t

# Note: before running this test, create the test files
# llil-1.txt and llil-2.txt by running: perl create-test-files.pl

use strict;
use warnings;
use LLiL;

use Test::More;
my $ntests = 5;
plan tests => $ntests;

# Totals expected after loading llil-1.txt and then llil-2.txt into one hash.
my $expected_href = {
   'camel'     => 69,
   'dromedary' => 76,
   'kibitzer'  => 1000,
   'pearl'     => 42
};

# A single hash is shared by every call below so that counts accumulate.
my %hash_ret;
my $href = \%hash_ret;

# Error tests
{
   my $n = LLiL::get_properties( 'non-existent-file', $href );
   cmp_ok( $n, '==', -1, "get_properties non existent file return value" );
}

# Normal tests
{
   my $n = LLiL::get_properties( 'llil-1.txt', $href );
   cmp_ok( $n, '==', 3, "get_properties file 1 return value" );
   $n = LLiL::get_properties( 'llil-2.txt', $href );
   cmp_ok( $n, '==', 5, "get_properties file 2 return value" );
   # Count the keys explicitly: before Perl 5.26 a hash in scalar context
   # yields bucket-usage information such as "3/8", not the number of keys.
   cmp_ok( scalar( keys %{$href} ), '==', 4, "number of items in hash" );
   is_deeply( $href, $expected_href, "hash content" );
}

##

##

perl -I . llil.t

##

##

prove -v -I . llil.t

##

##

> prove -v -I . llil.t
llil.t ..
1..5
ok 1 - get_properties non existent file return value
ok 2 - get_properties file 1 return value
ok 3 - get_properties file 2 return value
ok 4 - number of items in hash
ok 5 - hash content
ok
All tests successful.
Files=1, Tests=5,  0 wallclock secs ( 0.00 usr +  0.00 sys =  0.00 CPU)
Result: PASS

##

##

$ sudo apt-get -y install cmake
$ cd $HOME/local-catch2
$ git clone https://github.com/catchorg/Catch2.git
$ cd Catch2
$ cmake -Bbuild -H. -DBUILD_TESTING=OFF
$ sudo cmake --build build/ --target install

##

##

clang++ -o tcatch -std=c++20 -Wall -O3 -I "$HOME/local-parallel-hashmap/parallel-hashmap" -I "$HOME/local-boost/boost_1_81_0" -I /usr/local/include/catch2 tcatch.cpp /usr/local/lib/libCatch2Main.a /usr/local/lib/libCatch2.a

##

##

./tcatch --success --durations yes

##

##

Randomness seeded to: 640760095

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tcatch is a Catch2 v3.3.1 host application.
Run with -? for options

-------------------------------------------------------------------------------
Error tests
-------------------------------------------------------------------------------
tcatch.cpp:149
...............................................................................

tcatch.cpp:151: PASSED:
  REQUIRE( get_properties( "non-existent-file", mymap ) == -1 )
with expansion:
  -1 == -1

0.000 s: Error tests
-------------------------------------------------------------------------------
Normal tests
-------------------------------------------------------------------------------
tcatch.cpp:154
...............................................................................

tcatch.cpp:156: PASSED:
  REQUIRE( get_properties( "llil-1.txt", mymap ) == 3 )
with expansion:
  3 == 3

tcatch.cpp:157: PASSED:
  REQUIRE( get_properties( "llil-2.txt", mymap ) == 5 )
with expansion:
  5 == 5

tcatch.cpp:164: PASSED:
  REQUIRE_THAT( myvec, Catch::Matchers::UnorderedEquals( vec_str_int_type{ { str_type { "camel" }, 69 }, { str_type { "dromedary" }, 76 }, { str_type { "kibitzer" }, 1000 }, { str_type { "pearl" }, 42 } } ) )
with expansion:
  { {?}, {?}, {?}, {?} } UnorderedEquals: { {?}, {?}, {?}, {?} }

0.000 s: Normal tests
===============================================================================
All tests passed (4 assertions in 2 test cases)

##

##

// tcatch.cpp
// Example run: ./tcatch --success --durations yes
// For doco on Catch2 TEST_CASE and SECTION see:
//   https://github.com/catchorg/Catch2/blob/devel/docs/test-cases-and-sections.md

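// NOTE(review): the header names on these #include lines were lost when this
// listing was extracted. The names below are reconstructed from the
// identifiers used in this file -- confirm against the original source.
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cstdio>

#include <string>
#include <array>
#include <vector>

#include <utility>
#include <iterator>
#include <string_view>
#include <algorithm>

#include <boost/container_hash/hash.hpp>

#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>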

// ----------------------------------------------------------------------------

typedef int_fast64_t llil_int_type;

// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt  max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is only
// worth trying for MAX_STR_LEN_L up to about 22.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next line.
// #define MAX_STR_LEN_L 10

// Key type for the word -> count maps.
// With MAX_STR_LEN_L defined, a word lives in a fixed-size char array and is
// compared over all MAX_STR_LEN_L bytes, so unused trailing bytes must be '\0'.
// Otherwise a word is an ordinary string compared as a NUL-terminated C string.
#ifdef MAX_STR_LEN_L
struct str_type : std::array<char, MAX_STR_LEN_L> {
   bool operator==( const str_type& o ) const {
      return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0;
   }
   bool operator<( const str_type& o ) const {
      return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0;
   }
};
#else
struct str_type : std::basic_string<char> {
   bool operator==( const str_type& o ) const {
      return ::strcmp(this->data(), o.data()) == 0;
   }
   bool operator<( const str_type& o ) const {
      return ::strcmp(this->data(), o.data()) < 0;
   }
};
#endif

using str_int_type     = std::pair;
using vec_str_int_type = std::vector;

// inject specialization of std::hash for str_type into namespace std
namespace std {
   template<> struct hash {
      std::size_t operator()( str_type const& v ) const noexcept {
#if 0
         return boost::hash_range( v.cbegin(), v.cend() );
#else
         std::basic_string_view bv {
            reinterpret_cast(v.data()), v.size() * sizeof(char) };
         return std::hash>()(bv);
#endif
      }
   };
}

// Test with std::map, std::unordered_map or phmap::parallel_flat_hash_map
#define MT_STD_MAP_L                 0
#define MT_STD_UNORDERED_MAP_L       1
#define MT_PARALLEL_FLAT_HASH_MAP_L  2

// Uncomment one of the three map types below
#define MAP_TYPE_L  MT_STD_MAP_L
// #define MAP_TYPE_L  MT_STD_UNORDERED_MAP_L
// #define MAP_TYPE_L  MT_PARALLEL_FLAT_HASH_MAP_L

#if MAP_TYPE_L == MT_STD_MAP_L
#include 
using map_str_int_type = std::map;
#elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L
#include 
using map_str_int_type = std::unordered_map;
#elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L
#include 
// create the parallel_flat_hash_map without internal mutexes
using map_str_int_type = phmap::parallel_flat_hash_map<
   str_type, llil_int_type,
   phmap::priv::hash_default_hash,
   phmap::priv::hash_default_eq,
   phmap::priv::Allocator>,
   8, phmap::NullMutex
>;
#else
#error "Unsupported map_str_int_type"
#endif

// Simple RAII timer -----------------------------------------------------------
// Create a MyTimer object in a scope:
//    {
//       MyTimer tt;
//       ...
//    }
// to automatically print the time taken in the block to stderr

#include <chrono>

// Return the time from cstart to cend in seconds.
// The interval is first truncated to whole milliseconds, so the result has
// millisecond resolution.
inline double elaspe_time(
   std::chrono::high_resolution_clock::time_point cend,
   std::chrono::high_resolution_clock::time_point cstart)
{
   return double( std::chrono::duration_cast<std::chrono::milliseconds>(cend - cstart).count() ) * 1e-3;
}

// Records the time at construction and, on destruction, writes the elapsed
// time in the form " (N seconds)" followed by a newline to std::cerr.
class MyTimer {
public:
   MyTimer()  { stnow_m = std::chrono::high_resolution_clock::now(); }
   ~MyTimer() {
      auto endnow = std::chrono::high_resolution_clock::now();
      std::cerr << " (" << elaspe_time(endnow, stnow_m) << " seconds)\n";
   }
private:
   // Start time, captured by the constructor.
   std::chrono::time_point<std::chrono::high_resolution_clock> stnow_m;
};

// ---------------------------------------------------------------------

#include "get_properties.inl"

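// NOTE(review): header names were lost in extraction; these are the Catch2 v3
// headers for TEST_CASE/REQUIRE and the vector matchers -- confirm.
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_vector.hpp>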

// Catch2 tests start here -----------------------------------

// A file that cannot be opened must make get_properties() return -1.
TEST_CASE( "Error tests" ) {
   map_str_int_type mymap;
   REQUIRE( get_properties( "non-existent-file", mymap ) == -1 );
}

// Load both test files into one map, then check the value returned for each
// file and the accumulated totals.
// The files llil-1.txt and llil-2.txt are opened by relative name, so they
// must exist in the current directory.
TEST_CASE( "Normal tests" ) {
   map_str_int_type mymap;
   REQUIRE( get_properties( "llil-1.txt", mymap ) == 3 );
   REQUIRE( get_properties( "llil-2.txt", mymap ) == 5 );
   // Copy the entries into a vector and compare without regard to order, so
   // the check does not depend on the iteration order of the chosen map type.
   vec_str_int_type myvec( mymap.begin(), mymap.end() );
   REQUIRE_THAT( myvec, Catch::Matchers::UnorderedEquals( vec_str_int_type{
      { str_type { "camel"     },   69 },
      { str_type { "dromedary" },   76 },
      { str_type { "kibitzer"  }, 1000 },
      { str_type { "pearl"     },   42 }
   } ));
}

##

##

// get_properties.inl
// Note: str_type, llil_int_type and map_str_int_type are not defined here.
// llil_int_type is normally defined as: typedef int_fast64_t llil_int_type;
// Note: int_fast64_t is defined in <cstdint>
// map_str_int_type is keyed by str_type with value of llil_int_type.
// These three types must be defined by the code that includes this .inl file.
// map_str_int_type can be many different types, std::map, std::unordered_map, ...
// so long as all operations on map_upd below are supported.

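// NOTE(review): header names were lost in extraction; reconstructed from the
// names used below (uint8_t, fopen/fgets, std::array, std::find) -- confirm.
#include <cstdint>
#include <cstdio>
#include <array>
#include <algorithm>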

// Convert the run of decimal digits at the start of str to an integer.
// Conversion stops at the first character that is not '0'..'9'; if str does
// not begin with a digit the result is 0.
// A leading sign is not handled and overflow is not checked.
inline llil_int_type fast_atoll64( const char* str )
{
   llil_int_type result = 0;
   for ( ;; ++str ) {
      // Characters below '0' wrap around to large unsigned values, so one
      // comparison rejects everything outside '0'..'9'.
      const uint8_t d = uint8_t( *str - '0' );
      if ( d > 9 ) break;
      result = result * 10 + d;
   }
   return result;
}

// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255

// Update map_upd with the properties found in file fname
// Each line is expected to hold "word<TAB>count"; count is added to the map
// entry for word, which is created if it does not exist yet.
// Return the number of properties in file fname (or -1 if fname could not be opened)
// The value returned is the number of lines read. A line without a tab is
// counted but does not change the map.
// Note: fgets() stores at most MAX_LINE_LEN_L - 1 characters per call, so a
// longer line is read (and counted) in several pieces.
static llil_int_type get_properties(
   const char*        fname,     // in   : the input file name
   map_str_int_type&  map_upd    // inout: the map to be updated
)
{
   std::array<char, MAX_LINE_LEN_L + 1> line;
   llil_int_type nprop = 0;
   FILE* fh = ::fopen(fname, "r");
   if ( fh == NULL ) return -1;
   char* const buf_end = line.data() + line.size();
   while ( ::fgets( line.data(), static_cast<int>(MAX_LINE_LEN_L), fh ) != NULL ) {
      ++nprop;
      // Stop at the line's terminating '\0': the bytes after it are either
      // uninitialized or left over from an earlier, longer line, and must
      // not be searched for a tab.
      char* const found = std::find_if( line.data(), buf_end,
         []( char c ) { return c == '\t' || c == '\0'; } );
      if ( found == buf_end || *found != '\t' ) continue;   // no tab on this line
      const llil_int_type count = fast_atoll64( found + 1 );
      // Note: using emplace() is faster than map_upd[fixword] += count;
#ifdef MAX_STR_LEN_L
      // NOTE(review): a word longer than MAX_STR_LEN_L would overrun fixword;
      // the input is assumed to respect that limit.
      str_type fixword {};  // {} initializes all elements of fixword to '\0'
      std::copy( line.data(), found, fixword.begin() );
      const auto [it, success] = map_upd.emplace( fixword, count );
#else
      *found = '\0';        // terminate the word in place
      const auto [it, success] = map_upd.emplace(str_type{ line.data() }, count);
#endif
      // emplace() leaves an existing entry untouched, so add to it here.
      if (!success) it->second += count;
   }
   ::fclose(fh);
   return nprop;
}