# obtain abseil-cpp git clone --depth=1 https://github.com/abseil/abseil-cpp.git cd abseil-cpp mkdir build && cd build # build shared libs cmake \ -DCMAKE_INSTALL_PREFIX="${HOME}/abseil" \ -DBUILD_SHARED_LIBS=ON \ -DCMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath=${HOME}/abseil/lib64" \ -DABSL_BUILD_TESTING=OFF \ -DABSL_PROPAGATE_CXX_STD=ON \ -DABSL_USE_GOOGLETEST_HEAD=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -DNDEBUG" \ -DCMAKE_CXX_STANDARD=20 \ .. cmake --build . --parallel=8 --target install # build static libs (optional) rm -fr ../build/* cmake \ -DCMAKE_INSTALL_PREFIX="${HOME}/abseil" \ -DABSL_BUILD_TESTING=OFF \ -DABSL_PROPAGATE_CXX_STD=ON \ -DABSL_USE_GOOGLETEST_HEAD=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS_RELEASE="-O3 -DNDEBUG" \ -DCMAKE_CXX_STANDARD=20 \ .. cmake --build . --parallel=8 --target install #### // tdriver.cc update // https://perlmonks.org/?node_id=11150974 // - based on https://www.perlmonks.org/?node_id=11150945 // // Enhancements made on 2023-03-14: // - Consistent floating precision to stderr by showing trailing zeros. // - Added comments for building with abseil shared/static libraries. // - Added phmap::flat_hash_map to the list, faster than absl::flat_hash_map. // - Added ankerl::unordered_dense::map to the mix, faster than phmap::flat_hash_map. // - Added inclusion of absl::Hash with phmap, also used by ankerl behind the scene. // Note: phmap::parallel_flat_hash_map is the fastest of the bunch running parallel. // - Added duration time. // Enhancements made on 2023-03-21: // - Added MemoryMapped option (enable by changing #if 0 to #if 1; line 275) // // Building: // - std::map, std::unordered_map, boost::unordered_map // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc // // - phmap::flat_hash_map, phmap::parallel_flat_hash_map // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel-hashmap // // - ankerl::unordered_dense::map // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordered_dense/include // // Abseil shared libs: (uncomment MT_USE_ABSL_HASH_L below) // - phmap with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel-hashmap -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -Wl,-rpath=${HOME}/abseil/lib64 -labsl_hash // // - ankerl::unordered_dense::map with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordered_dense/include -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -Wl,-rpath=${HOME}/abseil/lib64 -labsl_hash // // - absl::flat_hash_map or absl::node_hash_map with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -Wl,-rpath=${HOME}/abseil/lib64 -labsl_hash -labsl_raw_hash_set -labsl_raw_logging_internal // // Abseil static libs: (uncomment MT_USE_ABSL_HASH_L below) // - phmap with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./parallel-hashmap -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -l:libabsl_hash.a -l:libabsl_low_level_hash.a -l:libabsl_city.a // // - ankerl::unordered_dense::map with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I./unordered_dense/include -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -l:libabsl_hash.a -l:libabsl_low_level_hash.a -l:libabsl_city.a // // - absl::flat_hash_map or absl::node_hash_map with absl::Hash // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc -I${HOME}/abseil/include -L${HOME}/abseil/lib64 -l:libabsl_hash.a -l:libabsl_low_level_hash.a -l:libabsl_raw_hash_set.a -l:libabsl_raw_logging_internal.a -l:libabsl_city.a #include #include #include #include #include #include #include #include #include #include #include #include #include // #include #include #include #include #include // Test with std::map, std::unordered_map or phmap::parallel_flat_hash_map // or boost::unorderedmap or absl::flat_hash_map #define MT_STD_MAP_L 0 #define MT_STD_UNORDERED_MAP_L 1 #define MT_FLAT_HASH_MAP_L 2 #define MT_PARALLEL_FLAT_HASH_MAP_L 3 #define MT_BOOST_UNORDERED_MAP_L 4 #define MT_GOOGLE_HASH_MAP_L 5 #define MT_GOOGLE_NODE_MAP_L 6 #define MT_ANKERL_UNORDERED_DENSE_MAP_L 7 // Uncomment to use absl::Hash function object // #define MT_USE_ABSL_HASH_L 1 #ifdef MT_USE_ABSL_HASH_L #include #endif // Uncomment one of the map types below // #define MAP_TYPE_L MT_STD_MAP_L // #define MAP_TYPE_L MT_STD_UNORDERED_MAP_L // #define MAP_TYPE_L MT_FLAT_HASH_MAP_L #define MAP_TYPE_L MT_PARALLEL_FLAT_HASH_MAP_L // #define MAP_TYPE_L MT_BOOST_UNORDERED_MAP_L // #define MAP_TYPE_L MT_GOOGLE_HASH_MAP_L // #define MAP_TYPE_L MT_GOOGLE_NODE_MAP_L // #define MAP_TYPE_L MT_ANKERL_UNORDERED_DENSE_MAP_L // ---------------------------------------------------------------------------- typedef int_fast64_t llil_int_type; // All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length. // big.txt max word length is 6 // long.txt max word length is 208 // // Based on rough benchmarking, the short fixed string hack below is only // worth trying for MAX_STR_LEN_L up to about 22. // See also https://backlinko.com/google-keyword-study // // To use (limited length) fixed length strings uncomment the next line. #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L struct str_type : std::array { bool operator==( const str_type& o ) const { return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0; } bool operator<( const str_type& o ) const { return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0; } }; #else struct str_type : std::basic_string { bool operator==( const str_type& o ) const { return ::strcmp(this->data(), o.data()) == 0; } bool operator<( const str_type& o ) const { return ::strcmp(this->data(), o.data()) < 0; } }; #endif using str_int_type = std::pair; using vec_str_int_type = std::vector; // inject specialization of std::hash or absl::Hash for str_type into namespace std namespace std { template<> struct hash { std::size_t operator()( str_type const& v ) const noexcept { #if 0 return boost::hash_range( v.cbegin(), v.cend() ); #else std::basic_string_view bv { reinterpret_cast(v.data()), v.size() * sizeof(char) }; #ifdef MT_USE_ABSL_HASH_L return absl::Hash>()(bv); #else return std::hash>()(bv); #endif #endif } }; } #if MAP_TYPE_L == MT_STD_MAP_L // https://en.cppreference.com/w/cpp/container/map #include using map_str_int_type = std::map; #elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L // https://en.cppreference.com/w/cpp/container/unordered_map #include using map_str_int_type = std::unordered_map; #elif MAP_TYPE_L == MT_FLAT_HASH_MAP_L // https://github.com/greg7mdp/parallel-hashmap // If you prefer the default hash to be the Abseil hash framework, // include the Abseil headers before `phmap.h` and define the // preprocessor macro `PHMAP_USE_ABSL_HASH`. // - refer to the README and parallel_hashmap/phmap_fwd_decl.h #ifdef MT_USE_ABSL_HASH_L #define PHMAP_USE_ABSL_HASH 1 #endif #include using map_str_int_type = phmap::flat_hash_map; #elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L // https://github.com/greg7mdp/parallel-hashmap // If you prefer the default hash to be the Abseil hash framework, // include the Abseil headers before `phmap.h` and define the // preprocessor macro `PHMAP_USE_ABSL_HASH`. // - refer to the README and parallel_hashmap/phmap_fwd_decl.h #ifdef MT_USE_ABSL_HASH_L #define PHMAP_USE_ABSL_HASH 1 #endif #include // create the parallel_flat_hash_map without internal mutexes using map_str_int_type = phmap::parallel_flat_hash_map< str_type, llil_int_type, phmap::priv::hash_default_hash, phmap::priv::hash_default_eq, phmap::priv::Allocator>, 8, phmap::NullMutex >; #elif MAP_TYPE_L == MT_BOOST_UNORDERED_MAP_L // https://www.boost.org/doc/libs/1_80_0/libs/unordered/doc/html/unordered.html #include using map_str_int_type = boost::unordered_map; #elif MAP_TYPE_L == MT_GOOGLE_HASH_MAP_L // https://github.com/abseil/abseil-cpp // By default, `flat_hash_map` uses the `absl::Hash` hashing framework. // - refer to absl/container/internal/hash_function_defaults.h #ifndef MT_USE_ABSL_HASH_L #define MT_USE_ABSL_HASH_L 1 #endif #include using map_str_int_type = absl::flat_hash_map; #elif MAP_TYPE_L == MT_GOOGLE_NODE_MAP_L // https://github.com/abseil/abseil-cpp // By default, `node_hash_map` uses the `absl::Hash` hashing framework. // - refer to absl/container/internal/hash_function_defaults.h #ifndef MT_USE_ABSL_HASH_L #define MT_USE_ABSL_HASH_L 1 #endif #include using map_str_int_type = absl::node_hash_map; #elif MAP_TYPE_L == MT_ANKERL_UNORDERED_DENSE_MAP_L // https://github.com/martinus/unordered_dense #include using map_str_int_type = ankerl::unordered_dense::map; #else #error "Unsupported map_str_int_type" #endif // Simple RAII timer ----------------------------------------------------------- // Create a MyTimer object in a scope: // { // MyTimer tt; // ... // } // to automatically print the time taken in the block to stderr #include inline double elaspe_time( std::chrono::high_resolution_clock::time_point cend, std::chrono::high_resolution_clock::time_point cstart) { return double( std::chrono::duration_cast(cend - cstart).count() ) * 1e-3; } class MyTimer { public: MyTimer() { stnow_m = std::chrono::high_resolution_clock::now(); } ~MyTimer() { auto endnow = std::chrono::high_resolution_clock::now(); std::cerr << " (" << elaspe_time(endnow, stnow_m) << " seconds)\n"; } private: std::chrono::time_point stnow_m; }; // --------------------------------------------------------------------- // #include "get_properties.inl" // get_properties.inl // Note: str_type, llil_int_type and map_str_int_type are not defined here. // llil_int_type is normally defined as: typedef int_fast64_t llil_int_type; // Note: int_fast64_t is defined in // map_str_int_type is keyed by str_type with value of llil_int_type. // These three types must be defined by the code that includes this .inl file. // map_str_int_type can be many different types, std::map, std::unordered_map, ... // so long as all operations on map_upd below are supported. #include #include #include #include #if 0 // Obtain the Portable Memory Mapping C++ Class. // git clone --depth=1 https://github.com/stbrumme/portable-memory-mapping // // Copy two files to you work dir: *.cpp and *.h // cp https://github.com/stbrumme/portable-memory-mapping/MemoryMapped.* . // // Compile on Linux // clang++ -o tdriver -std=c++20 -Wall -O3 tdriver.cc MemoryMapped.cpp ... inline llil_int_type fast_atoll64( const char* str, uint64_t* pos ) { llil_int_type val = 0; // int sign = 0; // if ( *str == '-' ) { // sign = 1, ++str, ++*pos; // } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit, ++*pos; // return sign ? -val : val; return val; } #include "MemoryMapped.h" // Update map_upd with the properties found in file fname // Return the number of properties in file fname (or -1 if fname could not be opened) static llil_int_type get_properties( const char* fname, // in : the input file name map_str_int_type& map_upd // inout: the map to be updated ) { // Map file to memory. MemoryMapped data(fname, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); if (!data.isValid()) return -1; // Get raw pointer to mapped memory. char* buffer = (char*) data.getData(); uint64_t filesize = data.size(); // Process mapped file. uint64_t length, strpos = 0; llil_int_type count; llil_int_type nprop = 0; for (uint64_t pos = 0; pos < filesize; pos++) { if (buffer[pos] == '\t') { ++nprop; length = pos - strpos; count = fast_atoll64( &buffer[pos + 1], &pos ); #ifdef MAX_STR_LEN_L str_type fixword {}; // {} initializes all elements of fixword to '\0' ( length <= MAX_STR_LEN_L ) ? ::memcpy( fixword.data(), &buffer[strpos], length ) : ::memcpy( fixword.data(), &buffer[strpos], MAX_STR_LEN_L ); const auto [it, success] = map_upd.emplace(fixword, count); #else const auto [it, success] = map_upd.emplace( str_type(&buffer[strpos], length), count ); #endif if ( !success ) it->second += count; strpos = pos = ( buffer[pos + 1] == '\r' ) ? pos + 3 : pos + 2; } } data.close(); return nprop; } #else inline llil_int_type fast_atoll64( const char* str ) { llil_int_type val = 0; // int sign = 0; // if ( *str == '-' ) { // sign = 1, ++str; // } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; // return sign ? -val : val; return val; } // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 // Update map_upd with the properties found in file fname // Return the number of properties in file fname (or -1 if fname could not be opened) static llil_int_type get_properties( const char* fname, // in : the input file name map_str_int_type& map_upd // inout: the map to be updated ) { std::array line; char* found; llil_int_type count; llil_int_type nprop = 0; FILE* fh = ::fopen(fname, "r"); if ( fh == NULL ) return -1; while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), fh ) != NULL ) { ++nprop; found = std::find( line.begin(), line.end(), '\t' ); count = fast_atoll64( found + 1 ); // Note: using emplace() is faster than map_upd[fixword] += count; #ifdef MAX_STR_LEN_L str_type fixword {}; // {} initializes all elements of fixword to '\0' std::copy( line.begin(), found, fixword.begin() ); const auto [it, success] = map_upd.emplace( fixword, count ); #else *found = '\0'; const auto [it, success] = map_upd.emplace(str_type{ line.data() }, count); #endif if (!success) it->second += count; } ::fclose(fh); return nprop; } #endif // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: tdriver file1 file2 ... >out.txt\n"; return 1; } std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fixed); #ifdef MAX_STR_LEN_L std::cerr << "tdriver (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #else std::cerr << "tdriver start\n"; #endif #if MAP_TYPE_L == MT_STD_MAP_L std::cerr << "use std::map\n"; #elif MAP_TYPE_L == MT_STD_UNORDERED_MAP_L std::cerr << "use std::unordered_map\n"; #elif MAP_TYPE_L == MT_FLAT_HASH_MAP_L std::cerr << "use phmap::flat_hash_map\n"; #elif MAP_TYPE_L == MT_PARALLEL_FLAT_HASH_MAP_L std::cerr << "use phmap::parallel_flat_hash_map\n"; #elif MAP_TYPE_L == MT_BOOST_UNORDERED_MAP_L std::cerr << "use boost::unordered_map\n"; #elif MAP_TYPE_L == MT_GOOGLE_HASH_MAP_L std::cerr << "use absl::flat_hash_map\n"; #elif MAP_TYPE_L == MT_GOOGLE_NODE_MAP_L std::cerr << "use absl::node_hash_map\n"; #elif MAP_TYPE_L == MT_ANKERL_UNORDERED_DENSE_MAP_L std::cerr << "use ankerl::unordered_dense::map\n"; #else #error "Unsupported map_str_int_type" #endif #ifdef MT_USE_ABSL_HASH_L std::cerr << "use absl::Hash function object in ctor or string view\n"; #endif // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; MyTimer tt_main; map_str_int_type mymap; for ( int i = 0; i < nfiles; ++i ) { MyTimer tt; llil_int_type nlines = get_properties(fname[i], mymap); std::cerr << fname[i] << ": nlines=" << nlines; } // Store the properties into a vector vec_str_int_type propvec( mymap.begin(), mymap.end() ); mymap.clear(); // Sort the vector by (count) in reverse order, (name) in lexical order std::sort( // Standard sort propvec.begin(), propvec.end(), [](const str_int_type& left, const str_int_type& right) { return left.second != right.second ? left.second > right.second : left.first < right.first; } ); // Output the sorted vector #ifdef MAX_STR_LEN_L for ( auto const& n : propvec ) ::printf("%.*s\t%ld\n", MAX_STR_LEN_L, n.first.data(), n.second); #else for ( auto const& n : propvec ) std::cout << n.first << "\t" << n.second << "\n"; #endif std::cerr << "duration:"; return 0; } #### (0) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use std::map big1.txt: nlines=3515200 ( 0.700 seconds) big2.txt: nlines=3515200 ( 0.640 seconds) big3.txt: nlines=3515200 ( 0.711 seconds) big4.txt: nlines=3515200 ( 0.774 seconds) big5.txt: nlines=3515200 ( 0.837 seconds) big6.txt: nlines=3515200 ( 0.901 seconds) duration: ( 9.409 seconds) 3181500212 183490425 (1) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use std::unordered_map big1.txt: nlines=3515200 ( 0.986 seconds) big2.txt: nlines=3515200 ( 1.224 seconds) big3.txt: nlines=3515200 ( 0.844 seconds) big4.txt: nlines=3515200 ( 1.724 seconds) big5.txt: nlines=3515200 ( 0.824 seconds) big6.txt: nlines=3515200 ( 0.889 seconds) duration: (14.335 seconds) 3181500212 183490425 (2) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use phmap::flat_hash_map w/ absl::Hash big1.txt: nlines=3515200 ( 0.307 seconds) ( 0.300 seconds) big2.txt: nlines=3515200 ( 0.419 seconds) ( 0.415 seconds) big3.txt: nlines=3515200 ( 0.529 seconds) ( 0.517 seconds) big4.txt: nlines=3515200 ( 0.461 seconds) ( 0.465 seconds) big5.txt: nlines=3515200 ( 0.662 seconds) ( 0.632 seconds) big6.txt: nlines=3515200 ( 0.506 seconds) ( 0.508 seconds) duration: ( 6.624 seconds) ( 6.501 seconds) 3181500212 183490425 (3) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use phmap::parallel_flat_hash_map w/ absl::Hash big1.txt: nlines=3515200 ( 0.370 seconds) ( 0.363 seconds) big2.txt: nlines=3515200 ( 0.511 seconds) ( 0.504 seconds) big3.txt: nlines=3515200 ( 0.645 seconds) ( 0.629 seconds) big4.txt: nlines=3515200 ( 0.597 seconds) ( 0.557 seconds) big5.txt: nlines=3515200 ( 0.813 seconds) ( 0.773 seconds) big6.txt: nlines=3515200 ( 0.604 seconds) ( 0.602 seconds) duration: ( 7.221 seconds) ( 7.079 seconds) 3181500212 183490425 (4) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use boost::unordered_map big1.txt: nlines=3515200 ( 0.839 seconds) big2.txt: nlines=3515200 ( 1.012 seconds) big3.txt: nlines=3515200 ( 1.280 seconds) big4.txt: nlines=3515200 ( 0.809 seconds) big5.txt: nlines=3515200 ( 1.885 seconds) big6.txt: nlines=3515200 ( 0.763 seconds) duration: (13.989 seconds) 3181500212 183490425 (5) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use absl::flat_hash_map use absl::Hash function object in ctor or string view big1.txt: nlines=3515200 ( 0.383 seconds) big2.txt: nlines=3515200 ( 0.480 seconds) big3.txt: nlines=3515200 ( 0.594 seconds) big4.txt: nlines=3515200 ( 0.502 seconds) big5.txt: nlines=3515200 ( 0.762 seconds) big6.txt: nlines=3515200 ( 0.548 seconds) duration: ( 7.069 seconds) 3181500212 183490425 (6) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use absl::node_hash_map use absl::Hash function object in ctor or string view big1.txt: nlines=3515200 ( 0.531 seconds) big2.txt: nlines=3515200 ( 0.687 seconds) big3.txt: nlines=3515200 ( 0.950 seconds) big4.txt: nlines=3515200 ( 0.597 seconds) big5.txt: nlines=3515200 ( 1.426 seconds) big6.txt: nlines=3515200 ( 0.634 seconds) duration: (12.332 seconds) 3181500212 183490425 (7) $ ./tdriver big[1-6].txt | cksum tdriver (fixed string length=6) start use ankerl::unordered_dense::map w/ absl::Hash big1.txt: nlines=3515200 ( 0.480 seconds) ( 0.388 seconds) big2.txt: nlines=3515200 ( 0.627 seconds) ( 0.498 seconds) big3.txt: nlines=3515200 ( 0.358 seconds) ( 0.368 seconds) big4.txt: nlines=3515200 ( 0.969 seconds) ( 0.689 seconds) big5.txt: nlines=3515200 ( 0.380 seconds) ( 0.390 seconds) big6.txt: nlines=3515200 ( 0.373 seconds) ( 0.379 seconds) duration: ( 6.121 seconds) ( 5.795 seconds) 3181500212 183490425