// llil5vec-tbb.cpp // Based on llil4vec-tbb.cpp in https://perlmonks.com/?node_id=11149687 // // Vector version using the Intel TBB library // Note: TBB concurrent vector is best avoided, too much locking overhead // based on llil3vec-tbb-a.cpp https://perlmonks.com/?node_id=11149622 // 1. Capture time and diff via chrono. // 2. Key words null-terminated for MAX_STR_LEN_L. // 3. Concat strings using fast_io::concatln during output. // 4. Support NUM_THREADS environment variable. // 5. Merge local vector inside the parallel_for block. // This allows threads, completing get_properties, to merge immediately. // 6. Moved vec_loc instantiation inside for loop. // 7. Support running one thread, writing to propvec (no merging). // 8. Capture time for get and sort properties separately. // 9. Added define for using boost's parallel sort; requires clang++. // // To obtain the fast_io library (required dependency): // git clone --depth=1 https://github.com/cppfastio/fast_io // // Compiled on Linux with: // clang++ -o llil5vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-fast_io/fast_io/include" -I "$HOME/local-boost/boost_1_81_0" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil5vec-tbb.cpp -l tbb // Also works with g++ but throws some compiler warnings about deprecated C++20 features. // Seems to run slightly faster when compiled with clang++ rather than g++ // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort. // https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/parallel.html // This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux. // Note: I was able to build this program just by downloading and unpacking Boost locally // (no need to build it because the bits we use are header file only) #define USE_BOOST_PARALLEL_SORT 1 #include #include // The fast_io header must come after chrono, else build error: // "no member named 'concatln' in namespace 'fast_io'" #include #include #include #include #include #include #include #include #include #include #include #include #if USE_BOOST_PARALLEL_SORT > 0 #include #else #endif #include #include #ifdef USE_TBB_L #include #include #include #include #endif #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // ---------------------------------------------------------------------------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length // To use (limited length) fixed length strings uncomment the next line // big.txt max word length is 6 // long.txt max word length is 208 // The standard big1.txt, big2.txt, big3.txt files all contain 3,515,200 lines #define N_LINES_BIG1_L 3515200 // Based on rough benchmarking, the short fixed string hack below is only // worth trying for MAX_STR_LEN_L up to about 22. // See also https://backlinko.com/google-keyword-study #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array; #else using str_type = std::string; #endif using str_int_type = std::pair; using int_str_type = std::pair; using vec_str_int_type = std::vector; using vec_int_str_type = std::vector; // fast_atoll64 -------------------------------------------------------- // // https://stackoverflow.com/questions/16826422/ // c-most-efficient-way-to-convert-string-to-int-faster-than-atoi inline int64_t fast_atoll64( const char* str ) { int64_t val = 0; int sign = 0; if ( *str == '-' ) { sign = 1, ++str; } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; return sign ? -val : val; } // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); // fast_atoll64 is a touch faster than ::atoll count = fast_atoll64( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } double elaspe_time( std::chrono::high_resolution_clock::time_point cend, std::chrono::high_resolution_clock::time_point cstart) { return double( std::chrono::duration_cast(cend - cstart).count() ) * 1e-6; } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil5vec-tbb file1 file2 ... >out.txt\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil5vec-tbb (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #else std::cerr << "llil5vec-tbb start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #else std::cerr << "don't use TBB\n"; #endif #if USE_BOOST_PARALLEL_SORT == 0 std::cerr << "don't use boost sort\n"; #else std::cerr << "use boost sort\n"; #endif std::chrono::high_resolution_clock::time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3; cstart1 = std::chrono::high_resolution_clock::now(); #ifdef USE_TBB_L // Determine the number of threads. const char* env_nthrs = std::getenv("NUM_THREADS"); int nthrs = (env_nthrs && strlen(env_nthrs)) ? ::atoi(env_nthrs) : std::thread::hardware_concurrency(); tbb::global_control global_limit(tbb::global_control::max_allowed_parallelism, nthrs); #else int nthrs = 1; #endif // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; // Create the vector of properties vec_int_str_type propvec; // propvec.reserve(N_LINES_BIG1_L * 3); // doesn't make much difference // Run parallel, depending on the number of threads if ( nthrs == 1 ) { for (int i = 0; i < nfiles; ++i) get_properties( fname[i], propvec ); } #ifdef USE_TBB_L else { tbb::parallel_for( tbb::blocked_range(0, nfiles, 1), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) { vec_int_str_type locvec; get_properties( fname[i], locvec ); // A static mutex that is shared across all threads static tbb::spin_mutex mtx; // Acquire a scoped lock tbb::spin_mutex::scoped_lock lock(mtx); // Append local vector to propvec propvec.insert( propvec.end(), locvec.begin(), locvec.end() ); } }); } #endif cend1 = std::chrono::high_resolution_clock::now(); double ctaken1 = elaspe_time(cend1, cstart1); std::cerr << "get properties time : " << ctaken1 << " secs\n"; cstart2 = std::chrono::high_resolution_clock::now(); // Needs to be sorted by word for later sum of adjacent count fields to work #if USE_BOOST_PARALLEL_SORT == 0 #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; } ); #else // Try building with clang++ if g++ emits errors boost::sort::block_indirect_sort( propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }, nthrs ); #endif cend2 = std::chrono::high_resolution_clock::now(); double ctaken2 = elaspe_time(cend2, cstart2); std::cerr << "sort properties time : " << ctaken2 << " secs\n"; cstart3 = std::chrono::high_resolution_clock::now(); // To sort, replace building a std::set with sorting a std::vector // Note: negative count gives desired ordering // Aside: consider how this loop might be parallelised vec_int_str_type myvec; auto it = propvec.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != propvec.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myvec.emplace_back( count, name_last ); name_last = it->second; count = it->first; } } myvec.emplace_back( count, name_last ); // As usual, stable_sort with a simpler sort function tends to be slower #if USE_BOOST_PARALLEL_SORT == 0 #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif myvec.begin(), myvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; } // use this one for std::stable_sort // [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; } ); #else // block_indirect_sort seems to be the fastest // Note: spinsort takes only 3 parameters (no nthrs parameter, unlike the other two) boost::sort::block_indirect_sort( // boost::sort::parallel_stable_sort( // boost::sort::spinsort( myvec.begin(), myvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; } // [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; } , nthrs ); #endif cend3s = std::chrono::high_resolution_clock::now(); // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L for ( auto const& n : myvec ) ::print(fast_io::concatln(std::string(n.second.data()), "\t", -n.first)); #else for ( auto const& n : myvec ) ::print(fast_io::concatln(n.second, "\t", -n.first)); #endif cend3 = std::chrono::high_resolution_clock::now(); double ctaken = elaspe_time(cend3, cstart1); double ctaken3s = elaspe_time(cend3s, cstart3); double ctaken3o = elaspe_time(cend3, cend3s); std::cerr << "vector stable sort time : " << ctaken3s << " secs\n"; std::cerr << "write stdout time : " << ctaken3o << " secs\n"; std::cerr << "total time : " << ctaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; }