# create-test-files.pl
# Creates the two LLiL-format sample files (llil-1.txt and llil-2.txt)
# that the llil.t unit test reads.
use strict;
use warnings;

# Write $data to $file, replacing any previous content.
#   $file : name of the file to create
#   $data : the complete text to store in it
# Returns 1 on success; dies with a message naming the file if the
# open, the write or the close fails.
sub build_file {
    my ( $file, $data ) = @_;
    open( my $fh, '>', $file ) or die "open '$file': $!";
    print {$fh} $data or die "write '$file': $!";

    # Output is buffered, so a failed write (e.g. disk full) may only be
    # reported when the handle is closed: check it explicitly.
    close($fh) or die "close '$file': $!";
    return 1;
}

# Sample data: one "word<TAB>count" property per line.
my $tt_1_data = <<"LLiL";
camel\t50
dromedary\t70
pearl\t42
LLiL

my $tt_2_data = <<"LLiL";
dromedary\t3
kibitzer\t1000
dromedary\t2
camel\t19
dromedary\t1
LLiL

my %test_files = (
    'llil-1.txt' => $tt_1_data,
    'llil-2.txt' => $tt_2_data
);

for my $fname ( sort keys %test_files ) {
    print STDERR "Create test file '$fname'...";
    unlink($fname);    # best effort: the file may not exist yet
    build_file( $fname, $test_files{$fname} );
    print STDERR "done.\n";
}

####

package LLiL;

use strict;
use warnings;

# Read a LLiL-format file.
# Return the number of lines in the file or -1 if the file could not be opened
# Update $hash_ret, a reference to a hash of properties
sub get_properties {
    my $fname    = shift;    # in: a LLiL-format filename
    my $hash_ret = shift;    # inout: a reference to a hash of properties

    my $cnt = 0;
    open( my $fh, '<', $fname ) or return -1;

    # The read loop assigns to the global $_; localize it so the
    # caller's $_ is left untouched.
    local $_;
    while (<$fh>) {
        ++$cnt;
        chomp;
        my ( $word, $count ) = split /\t/;
        $hash_ret->{$word} += $count;    # counts for a repeated word accumulate
    }
    close($fh);
    return $cnt;
}

# Note: Some extra validation that could be done in get_properties() above
# ( not done because, to allow the code to run as fast as possible,
#   get_properties assumes the input data adheres to the LLiL spec,
#   that is, each line matches ^[a-z]+\t\d+$ ):
#    s/^\s+//; s/\s+$//;     # remove leading and trailing whitespace
#    next unless length;     # ignore empty lines
#    $word =~ /^[a-z]+$/ or die "error: invalid word '$_' (must contain [a-z] only)";
#    $count =~ /^\d+$/ or die "error: invalid count '$_' (must contain [0-9] only)";

1;

####

# llil.t
# Simple unit test of get_properties() function in LLiL.pm.
# Normal run of this test : prove -v -I . llil.t
# Can also run with       : perl -I .
#                           llil.t
# Note: before running this test, create the test files
# llil-1.txt and llil-2.txt by running: perl create-test-files.pl

use strict;
use warnings;

use LLiL;
use Test::More;

my $ntests = 5;
plan tests => $ntests;

# Totals expected after both sample files have been merged into one hash.
my $expected_href = {
    'camel'     => 69,
    'dromedary' => 76,
    'kibitzer'  => 1000,
    'pearl'     => 42
};
my %hash_ret;
my $href = \%hash_ret;

# Error tests
{
    my $n = LLiL::get_properties( 'non-existent-file', $href );
    cmp_ok( $n, '==', -1, "get_properties non existent file return value" );
}

# Normal tests
{
    my $n = LLiL::get_properties( 'llil-1.txt', $href );
    cmp_ok( $n, '==', 3, "get_properties file 1 return value" );
    $n = LLiL::get_properties( 'llil-2.txt', $href );
    cmp_ok( $n, '==', 5, "get_properties file 2 return value" );

    # Count the keys explicitly: scalar(%hash) only returns the number of
    # keys on perl 5.26 and later; older perls return a "used/total"
    # bucket string, which is not the key count.
    cmp_ok( scalar( keys %{$href} ), '==', 4, "number of items in hash" );
    is_deeply( $href, $expected_href, "hash content" );
}

####

perl -I . llil.t

####

prove -v -I . llil.t

####

> prove -v -I . llil.t
llil.t ..
1..5
ok 1 - get_properties non existent file return value
ok 2 - get_properties file 1 return value
ok 3 - get_properties file 2 return value
ok 4 - number of items in hash
ok 5 - hash content
ok
All tests successful.
Files=1, Tests=5, 0 wallclock secs ( 0.00 usr + 0.00 sys = 0.00 CPU)
Result: PASS

####

$ sudo apt-get -y install cmake
$ cd $HOME/local-catch2
$ git clone https://github.com/catchorg/Catch2.git
$ cd Catch2
$ cmake -Bbuild -H. -DBUILD_TESTING=OFF
$ sudo cmake --build build/ --target install

####

clang++ -o tcatch -std=c++20 -Wall -O3 -I "$HOME/local-parallel-hashmap/parallel-hashmap" -I "$HOME/local-boost/boost_1_81_0" -I /usr/local/include/catch2 tcatch.cpp /usr/local/lib/libCatch2Main.a /usr/local/lib/libCatch2.a

####

./tcatch --success --durations yes

####

Randomness seeded to: 640760095

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tcatch is a Catch2 v3.3.1 host application.
Run with -?
for options ------------------------------------------------------------------------------- Error tests ------------------------------------------------------------------------------- tcatch.cpp:149 ............................................................................... tcatch.cpp:151: PASSED: REQUIRE( get_properties( "non-existent-file", mymap ) == -1 ) with expansion: -1 == -1 0.000 s: Error tests ------------------------------------------------------------------------------- Normal tests ------------------------------------------------------------------------------- tcatch.cpp:154 ............................................................................... tcatch.cpp:156: PASSED: REQUIRE( get_properties( "llil-1.txt", mymap ) == 3 ) with expansion: 3 == 3 tcatch.cpp:157: PASSED: REQUIRE( get_properties( "llil-2.txt", mymap ) == 5 ) with expansion: 5 == 5 tcatch.cpp:164: PASSED: REQUIRE_THAT( myvec, Catch::Matchers::UnorderedEquals( vec_str_int_type{ { str_type { "camel" }, 69 }, { str_type { "dromedary" }, 76 }, { str_type { "kibitzer" }, 1000 }, { str_type { "pearl" }, 42 } } ) ) with expansion: { {?}, {?}, {?}, {?} } UnorderedEquals: { {?}, {?}, {?}, {?} } 0.000 s: Normal tests =============================================================================== All tests passed (4 assertions in 2 test cases) #### // tcatch.cpp // Example run: ./tcatch --success --durations yes // For doco on Catch2 TEST_CASE and SECTION see: // https://github.com/catchorg/Catch2/blob/devel/docs/test-cases-and-sections.md #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // ---------------------------------------------------------------------------- typedef int_fast64_t llil_int_type; // All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length. 
// big.txt max word length is 6 // long.txt max word length is 208 // // Based on rough benchmarking, the short fixed string hack below is only // worth trying for MAX_STR_LEN_L up to about 22. // See also https://backlinko.com/google-keyword-study // // To use (limited length) fixed length strings uncomment the next line. // #define MAX_STR_LEN_L 10 #ifdef MAX_STR_LEN_L struct str_type : std::array { bool operator==( const str_type& o ) const { return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0; } bool operator<( const str_type& o ) const { return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0; } }; #else struct str_type : std::basic_string { bool operator==( const str_type& o ) const { return ::strcmp(this->data(), o.data()) == 0; } bool operator<( const str_type& o ) const { return ::strcmp(this->data(), o.data()) < 0; } }; #endif using str_int_type = std::pair; using vec_str_int_type = std::vector; // inject specialization of std::hash for str_type into namespace std namespace std { template<> struct hash { std::size_t operator()( str_type const& v ) const noexcept { #if 0 return boost::hash_range( v.cbegin(), v.cend() ); #else std::basic_string_view bv { reinterpret_cast(v.data()), v.size() * sizeof(char) }; return std::hash>()(bv); #endif } }; } // Test with std::map, std::unordered_map or phmap::parallel_flat_hash_map #define MT_STD_MAP_L 0 #define MT_STD_UNORDERED_MAP_L 1 #define MT_PARALLEL_FLAT_HASH_MAP_L 2 // Uncomment one of the three map types below #define MAP_TYPE_L MT_STD_MAP_L // #define MAP_TYPE_L MT_STD_UNORDERED_MAP_L // #define MAP_TYPE_L MT_PARALLEL_FLAT_HASH_MAP_L #if MAP_TYPE_L == MT_STD_MAP_L #include using map_str_int_type = std::map; #elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L #include using map_str_int_type = std::unordered_map; #elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L #include // create the parallel_flat_hash_map without internal mutexes using map_str_int_type = phmap::parallel_flat_hash_map< str_type, llil_int_type, 
phmap::priv::hash_default_hash, phmap::priv::hash_default_eq, phmap::priv::Allocator>, 8, phmap::NullMutex >; #else #error "Unsupported map_str_int_type" #endif // Simple RAII timer ----------------------------------------------------------- // Create a MyTimer object in a scope: // { // MyTimer tt; // ... // } // to automatically print the time taken in the block to stderr #include inline double elaspe_time( std::chrono::high_resolution_clock::time_point cend, std::chrono::high_resolution_clock::time_point cstart) { return double( std::chrono::duration_cast(cend - cstart).count() ) * 1e-3; } class MyTimer { public: MyTimer() { stnow_m = std::chrono::high_resolution_clock::now(); } ~MyTimer() { auto endnow = std::chrono::high_resolution_clock::now(); std::cerr << " (" << elaspe_time(endnow, stnow_m) << " seconds)\n"; } private: std::chrono::time_point stnow_m; }; // --------------------------------------------------------------------- #include "get_properties.inl" #include #include // Catch2 tests start here ----------------------------------- TEST_CASE( "Error tests" ) { map_str_int_type mymap; REQUIRE( get_properties( "non-existent-file", mymap ) == -1 ); } TEST_CASE( "Normal tests" ) { map_str_int_type mymap; REQUIRE( get_properties( "llil-1.txt", mymap ) == 3 ); REQUIRE( get_properties( "llil-2.txt", mymap ) == 5 ); vec_str_int_type myvec( mymap.begin(), mymap.end() ); REQUIRE_THAT( myvec, Catch::Matchers::UnorderedEquals( vec_str_int_type{ { str_type { "camel" }, 69 }, { str_type { "dromedary" }, 76 }, { str_type { "kibitzer" }, 1000 }, { str_type { "pearl" }, 42 } } )); } #### // get_properties.inl // Note: str_type, llil_int_type and map_str_int_type are not defined here. // llil_int_type is normally defined as: typedef int_fast64_t llil_int_type; // Note: int_fast64_t is defined in // map_str_int_type is keyed by str_type with value of llil_int_type. // These three types must be defined by the code that includes this .inl file. 
// map_str_int_type can be many different types, std::map, std::unordered_map, ... // so long as all operations on map_upd below are supported. #include #include #include #include inline llil_int_type fast_atoll64( const char* str ) { llil_int_type val = 0; // int sign = 0; // if ( *str == '-' ) { // sign = 1, ++str; // } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; // return sign ? -val : val; return val; } // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 // Update map_upd with the properties found in file fname // Return the number of properties in file fname (or -1 if fname could not be opened) static llil_int_type get_properties( const char* fname, // in : the input file name map_str_int_type& map_upd // inout: the map to be updated ) { std::array line; char* found; llil_int_type count; llil_int_type nprop = 0; FILE* fh = ::fopen(fname, "r"); if ( fh == NULL ) return -1; while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), fh ) != NULL ) { ++nprop; found = std::find( line.begin(), line.end(), '\t' ); count = fast_atoll64( found + 1 ); // Note: using emplace() is faster than map_upd[fixword] += count; #ifdef MAX_STR_LEN_L str_type fixword {}; // {} initializes all elements of fixword to '\0' std::copy( line.begin(), found, fixword.begin() ); const auto [it, success] = map_upd.emplace( fixword, count ); #else *found = '\0'; const auto [it, success] = map_upd.emplace(str_type{ line.data() }, count); #endif if (!success) it->second += count; } ::fclose(fh); return nprop; }