. $HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/env/vars.sh #### g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb.cpp -l tbb #### $ locale -a C C.utf8 POSIX #### // std::cout.imbue(std::locale{"en_US.UTF8"}); std::cout.imbue(std::locale{"C.utf8"}); #### using map_str_int_type = std::map; #### using map_str_int_type = tbb::concurrent_map; #### // llil3vec-tbb.cpp. // Vector version of llil2grt.pl. // g++ compile on Linux: // g++ -o llil3vec-tbb -std=c++20 -Wall -O3 llil3vec-tbb.cpp // g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb.cpp -l tbb // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil3vec-tbb big1.txt big2.txt big3.txt >vec.tmp // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Uncomment to use TBB concurrent vector (best to avoid this -- too much locking overhead) // #define USE_CONCURRENT_VECTOR_L 1 // Maximum allowed number of input files #define MAX_INPUT_FILES_L 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_TBB_L #include #include #ifdef USE_CONCURRENT_VECTOR_L #include #endif #endif #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // ---------------------------------------------------------------------------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length // To use (limited length) fixed length strings uncomment the next line // big.txt max word length is 6 // long.txt max word length is 208 // Based on rough benchmarking, this short fixed string hack // is only worth trying for MAX_STR_LEN_L up to about 64 #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array; #else using str_type = std::string; #endif using str_int_type = std::pair; using int_str_type = std::pair; using vec_str_int_type = std::vector; using set_int_str_type = std::set; #ifdef USE_CONCURRENT_VECTOR_L using vec_int_str_type = tbb::concurrent_vector; #else using vec_int_str_type = std::vector; #endif // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties_one_file( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); count = ::atoll( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names vec_int_str_type& vec_ret) // out: a vector of properties { #ifdef USE_TBB_L #ifdef USE_CONCURRENT_VECTOR_L tbb::parallel_for( tbb::blocked_range(0, nfiles), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) get_properties_one_file( fname[i], vec_ret ); }); #else // Use array of local vectors to parallelise without locking overhead // of using tbb::concurrent_vector vec_int_str_type locvec[MAX_INPUT_FILES_L]; tbb::parallel_for( tbb::blocked_range(0, nfiles), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) get_properties_one_file( fname[i], locvec[i] ); }); #endif #else // Not using TBB, just keep on appending to vec_ret for (int i = 0; i < nfiles; ++i) get_properties_one_file( fname[i], vec_ret ); #endif #ifdef USE_TBB_L #ifdef USE_CONCURRENT_VECTOR_L // nothing to do #else for (int i = 0; i < nfiles; ++i) vec_ret.insert( vec_ret.end(), locvec[i].begin(), locvec[i].end() ); #endif #endif // Needs to be sorted by word for later sum of adjacent count fields to work #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif vec_ret.begin(), vec_ret.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; } ); } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil3vec-tbb file1 file2 ... >out.txt\n"; return 1; } if (argc > MAX_INPUT_FILES_L) { std::cerr << "sorry too many input files, must be less than " << MAX_INPUT_FILES_L << "\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil3vec-tbb (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #else std::cerr << "llil3vec-tbb start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #ifdef USE_CONCURRENT_VECTOR_L std::cerr << "use concurrent vector\n"; #endif #else std::cerr << "don't use TBB\n"; #endif time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Create the vector of properties vec_int_str_type vec_ret; get_properties(argc - 1, &argv[1], vec_ret); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC; std::cerr << "get_properties CPU time : " << ctaken1 << " secs\n"; clock_t cstart2 = ::clock(); // To avoid calling sort(), create an inverted std::set container // Note: negative count gives desired ordering set_int_str_type myset; auto it = vec_ret.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != vec_ret.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myset.emplace_hint( myset.end(), count, name_last ); name_last = it->second; count = it->first; } } myset.emplace_hint( myset.end(), count, name_last ); clock_t cend2s = ::clock(); // Output the (already sorted) std::set - no sort() function required // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L // If name is not NULL-terminated: for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN_L, n.second.data(), -n.first ); // If name is NULL-terminated: // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.data(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second.data() << '\t' << -n.first << '\n'; #else // Can try printf vs std::cout to see which is faster for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_str(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second << '\t' << -n.first << '\n'; #endif clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast(::difftime(tend2, tstart1) + 0.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC; double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER_SEC; std::cerr << "emplace set sort CPU time : " << ctaken2s << " secs\n"; std::cerr << "write stdout CPU time : " << ctaken2o << " secs\n"; std::cerr << "total CPU time : " << ctaken << " secs\n"; std::cerr << "total wall clock time : " << ttaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; } #### OpenMp version: $ time ./llil4vec_p big1.txt big2.txt big3.txt >vec.tmp llil2vec (fixed string length=6) start get_properties time : 0.853868 secs emplace set sort time : 0.972116 secs write stdout time : 0.875946 secs total time : 2.70229 secs real 0m3.041s user 0m5.553s sys 0m0.745s #### OneTbb version: $ time ./llil3vec-tbb big1.txt big2.txt big3.txt >f.tmp llil3vec-tbb (fixed string length=6) start use TBB get_properties CPU time : 3.30477 secs emplace set sort CPU time : 0.825981 secs write stdout CPU time : 0.866964 secs total CPU time : 4.99808 secs total wall clock time : 3 secs real 0m2.890s user 0m4.687s sys 0m0.646s #### $ time ./llil3vec-tbb-a big1.txt big2.txt big3.txt >f.tmp llil3vec-tbb-a (fixed string length=6) start use TBB get_properties CPU time : 2.9722 secs emplace set sort CPU time : 0.61156 secs write stdout CPU time : 0.828023 secs total CPU time : 4.41257 secs total wall clock time : 3 secs real 0m2.552s user 0m4.304s sys 0m0.438s #### clang++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/llil/cmdlinux/fast_io/include" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb-a.cpp -l tbb #### // llil3vec-tbb-a.cpp. // Vector version using the Intel TBB library // Note: TBB concurrent vector is best avoided, too much locking overhead // g++ compile on Linux: // g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 llil3vec-tbb-a.cpp // g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb-a.cpp -l tbb // Seems to run slightly faster when compiled with clang++ instead of g++ // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil3vec-tbb-a big1.txt big2.txt big3.txt >vec.tmp // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Maximum allowed number of input files #define MAX_INPUT_FILES_L 64 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_TBB_L #include #include #endif #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // ---------------------------------------------------------------------------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length // To use (limited length) fixed length strings uncomment the next line // big.txt max word length is 6 // long.txt max word length is 208 // The standard big1.txt, big2.txt, big3.txt files all contain 3,515,200 lines #define N_LINES_BIG1_L 3515200 // Based on rough benchmarking, this short fixed string hack // is only worth trying for MAX_STR_LEN_L up to about 64 #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array; #else using str_type = std::string; #endif using str_int_type = std::pair; using int_str_type = std::pair; using vec_str_int_type = std::vector; using vec_int_str_type = std::vector; using set_int_str_type = std::set; // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); count = ::atoll( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil3vec-tbb-a file1 file2 ... >out.txt\n"; return 1; } if (argc > MAX_INPUT_FILES_L) { std::cerr << "sorry too many input files, must be less than " << MAX_INPUT_FILES_L << "\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil3vec-tbb-a (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #else std::cerr << "llil3vec-tbb-a start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #else std::cerr << "don't use TBB\n"; #endif time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; // Create the vector of properties vec_int_str_type propvec; // propvec.reserve(N_LINES_BIG1_L * 3); // doesn't make much difference #ifdef USE_TBB_L // Use an array of local vectors to parallelise vec_int_str_type locvec[MAX_INPUT_FILES_L]; tbb::parallel_for( tbb::blocked_range(0, nfiles), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) get_properties( fname[i], locvec[i] ); }); // Append local vectors to propvec // Aside: Try setting locvec[0] to propvec and use/append one less local vector? for (int i = 0; i < nfiles; ++i) propvec.insert( propvec.end(), locvec[i].begin(), locvec[i].end() ); #else // Not using TBB: so just keep on appending to propvec for (int i = 0; i < nfiles; ++i) get_properties( fname[i], propvec ); #endif // Needs to be sorted by word for later sum of adjacent count fields to work #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; } ); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC; std::cerr << "get_properties CPU time : " << ctaken1 << " secs\n"; clock_t cstart2 = ::clock(); // To avoid calling sort(), create an inverted std::set container // Note: negative count gives desired ordering // Aside: try parallel std::reduce/std::accumulate here? (see also [wp://MapReduce]) set_int_str_type myset; auto it = propvec.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != propvec.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myset.emplace_hint( myset.end(), count, name_last ); name_last = it->second; count = it->first; } } myset.emplace_hint( myset.end(), count, name_last ); clock_t cend2s = ::clock(); // Output the (already sorted) std::set - no sort() function required // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L // If name is not NULL-terminated: for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN_L, n.second.data(), -n.first ); // If name is NULL-terminated: // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.data(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second.data() << '\t' << -n.first << '\n'; #else // Can try printf vs std::cout to see which is faster for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_str(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second << '\t' << -n.first << '\n'; #endif clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast(::difftime(tend2, tstart1) + 0.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC; double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER_SEC; std::cerr << "emplace set sort CPU time : " << ctaken2s << " secs\n"; std::cerr << "write stdout CPU time : " << ctaken2o << " secs\n"; std::cerr << "total CPU time : " << ctaken << " secs\n"; std::cerr << "total wall clock time : " << ttaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; }