in reply to Re^7: Rosetta Code: Long List is Long (faster - llil4vec - OpenMP code)
in thread Rosetta Code: Long List is Long
Excellent work as always from marioroy. Much appreciated.
While struggling to learn OpenMP I stumbled upon Intel's OneAPI Threading Building Blocks (aka oneTBB). For some reason, I found this library easier to understand, so decided to give it a try. After downloading the oneapi-tbb-2021.7.0-lin.tgz release package from oneTBB 2021.7.0 and unpacking it under my Ubuntu $HOME dir and running:
to set the oneTBB variables, I was up and running. This is the command I used to compile C++ programs:. $HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/env/vars.sh
g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/on +eapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021 +.7.0/lib" llil3vec-tbb.cpp -l tbb
Update: Much later, I hit a problem with locales:
Fixed crashing par1.cpp in pgatram dir by changing:$ locale -a C C.utf8 POSIX
in sample code from transform_reduce.// std::cout.imbue(std::locale{"en_US.UTF8"}); std::cout.imbue(std::locale{"C.utf8"});
What attracted to me to TBB was the ease of trying out updating a std::map from multiple threads with minimal changes simply by changing from:
to:using map_str_int_type = std::map<str_type, llil_int_type>;
using map_str_int_type = tbb::concurrent_map<str_type, llil_int_type>;
While that worked, it ran a little bit slower, presumably due to the locking overhead associated with updating the tbb::concurrent_map variable (hash_ret[word] -= count) from multiple threads. To avoid crashes, I further needed to break down the get_properties function so that each thread operated on a different input file (see get_properties_one_file function below).
I was able to get a minor speedup (similar to what I saw with OpenMP) by using this library without any locking on a vector, as shown in the sample code below:
// llil3vec-tbb.cpp. // Vector version of llil2grt.pl. // g++ compile on Linux: // g++ -o llil3vec-tbb -std=c++20 -Wall -O3 llil3vec-tbb.cpp // g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi- +tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tb +b-2021.7.0/lib" llil3vec-tbb.cpp -l tbb // This g++ command also works with mingw C++ compiler (https://source +forge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex +e). // Example run: llil3vec-tbb big1.txt big2.txt big3.txt >vec.tmp // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Uncomment to use TBB concurrent vector (best to avoid this -- too m +uch locking overhead) // #define USE_CONCURRENT_VECTOR_L 1 // Maximum allowed number of input files #define MAX_INPUT_FILES_L 64 #include <cstdio> #include <cstddef> #include <cstdint> #include <cstdlib> #include <cstring> #include <ctime> #include <string> #include <array> #include <vector> #include <set> #include <utility> #include <iterator> #include <algorithm> #include <execution> #ifdef USE_TBB_L #include <tbb/parallel_sort.h> #include <tbb/parallel_for.h> #ifdef USE_CONCURRENT_VECTOR_L #include <tbb/concurrent_vector.h> #endif #endif #include <iostream> #include <fstream> #include <sstream> static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne +ed a 64-bit compile"); // ------------------------------------------------------------------- +--------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping + at // program end (see also sleep hack at end of main) // #include <chrono> // #include <thread> // ------------------------------------------------------------------- +--------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in l +ength // To use (limited length) fixed length strings uncomment the next lin +e // big.txt max word length is 6 // long.txt max word length is 208 // Based on rough benchmarking, this short fixed string hack // is only worth trying for MAX_STR_LEN_L up to about 64 #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array<char, MAX_STR_LEN_L>; #else using str_type = std::string; #endif using str_int_type = std::pair<str_type, llil_int_type>; using int_str_type = std::pair<llil_int_type, str_type>; using vec_str_int_type = std::vector<str_int_type>; using set_int_str_type = std::set<int_str_type>; #ifdef USE_CONCURRENT_VECTOR_L using vec_int_str_type = tbb::concurrent_vector<int_str_type>; #else using vec_int_str_type = std::vector<int_str_type>; #endif // Mimic the Perl get_properties subroutine -------------------------- +-- // Limit line length and use ANSI C functions to try to boost performa +nce #define MAX_LINE_LEN_L 255 static void get_properties_one_file( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno + << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); count = ::atoll( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names vec_int_str_type& vec_ret) // out: a vector of properties { #ifdef USE_TBB_L #ifdef USE_CONCURRENT_VECTOR_L tbb::parallel_for( tbb::blocked_range<int>(0, nfiles), [&](tbb::blocked_range<int> r) { for (int i = r.begin(); i < r.end(); ++i) get_properties_one_file( fname[i], vec_ret ); }); #else // Use array of local vectors to parallelise without locking overhe +ad // of using tbb::concurrent_vector vec_int_str_type locvec[MAX_INPUT_FILES_L]; tbb::parallel_for( tbb::blocked_range<int>(0, nfiles), [&](tbb::blocked_range<int> r) { for (int i = r.begin(); i < r.end(); ++i) get_properties_one_file( fname[i], locvec[i] ); }); #endif #else // Not using TBB, just keep on appending to vec_ret for (int i = 0; i < nfiles; ++i) get_properties_one_file( fname[i], vec_ret ); #endif #ifdef USE_TBB_L #ifdef USE_CONCURRENT_VECTOR_L // nothing to do #else for (int i = 0; i < nfiles; ++i) vec_ret.insert( vec_ret.end(), locvec[i].begin(), locvec[i].end( +) ); #endif #endif // Needs to be sorted by word for later sum of adjacent count field +s to work #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif vec_ret.begin(), vec_ret.end(), [](const int_str_type& left, const int_str_type& right) { return + left.second < right.second; } ); } // ------------------------------------------------------------------- +-- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil3vec-tbb file1 file2 ... >out.txt\n"; return 1; } if (argc > MAX_INPUT_FILES_L) { std::cerr << "sorry too many input files, must be less than " << + MAX_INPUT_FILES_L << "\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil3vec-tbb (fixed string length=" << MAX_STR_LEN_L +<< ") start\n"; #else std::cerr << "llil3vec-tbb start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #ifdef USE_CONCURRENT_VECTOR_L std::cerr << "use concurrent vector\n"; #endif #else std::cerr << "don't use TBB\n"; #endif time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Create the vector of properties vec_int_str_type vec_ret; get_properties(argc - 1, &argv[1], vec_ret); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SE +C; std::cerr << "get_properties CPU time : " << ctaken1 << " secs +\n"; clock_t cstart2 = ::clock(); // To avoid calling sort(), create an inverted std::set container // Note: negative count gives desired ordering set_int_str_type myset; auto it = vec_ret.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != vec_ret.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myset.emplace_hint( myset.end(), count, name_last ); name_last = it->second; count = it->first; } } myset.emplace_hint( myset.end(), count, name_last ); clock_t cend2s = ::clock(); // Output the (already sorted) std::set - no sort() function requir +ed // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L // If name is not NULL-terminated: for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN +_L, n.second.data(), -n.first ); // If name is NULL-terminated: // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.d +ata(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second.data() << '\ +t' << -n.first << '\n'; #else // Can try printf vs std::cout to see which is faster for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_st +r(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second << '\t' << - +n.first << '\n'; #endif clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast<long>(::difftime(tend2, tstart1) + 0 +.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_ +SEC; double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER +_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER +_SEC; std::cerr << "emplace set sort CPU time : " << ctaken2s << " sec +s\n"; std::cerr << "write stdout CPU time : " << ctaken2o << " sec +s\n"; std::cerr << "total CPU time : " << ctaken << " sec +s\n"; std::cerr << "total wall clock time : " << ttaken << " sec +s\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment nex +t line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000 +)); return 0; }
Timings of OpenMP vs OneTbb on my machine
OpenMp version: $ time ./llil4vec_p big1.txt big2.txt big3.txt >vec.tmp llil2vec (fixed string length=6) start get_properties time : 0.853868 secs emplace set sort time : 0.972116 secs write stdout time : 0.875946 secs total time : 2.70229 secs real 0m3.041s user 0m5.553s sys 0m0.745s
OneTbb version: $ time ./llil3vec-tbb big1.txt big2.txt big3.txt >f.tmp llil3vec-tbb (fixed string length=6) start use TBB get_properties CPU time : 3.30477 secs emplace set sort CPU time : 0.825981 secs write stdout CPU time : 0.866964 secs total CPU time : 4.99808 secs total wall clock time : 3 secs real 0m2.890s user 0m4.687s sys 0m0.646s
The real time reported by the Linux time command when running the tbb version of 2.890s compares favourably with 3.041s of the OpenMP version.
Apart from performing better on modern CPU caches, std::vector seems to also outperform std::map in multi-threaded programs, due to the locking overhead of updating a global map from multiple threads.
Update: Timings for llil3vec-tbb-a.cpp below, built with clang++ and fast_io are slightly faster:
$ time ./llil3vec-tbb-a big1.txt big2.txt big3.txt >f.tmp llil3vec-tbb-a (fixed string length=6) start use TBB get_properties CPU time : 2.9722 secs emplace set sort CPU time : 0.61156 secs write stdout CPU time : 0.828023 secs total CPU time : 4.41257 secs total wall clock time : 3 secs real 0m2.552s user 0m4.304s sys 0m0.438s
Updated Simpler Version llil3vec-tbb-a.cpp
Update: built with:
clang++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/llil/cmdlinux +/fast_io/include" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/incl +ude" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb +-a.cpp -l tbb
Oh, and see llil4vec-tbb.cpp in Re^9: Rosetta Code: Long List is Long (faster - llil4vec - TBB code) by marioroy for a cleaner way to merge the local array locvec[i] into vec_ret, via a scoped lock and a mutex, thus eliminating the ugly locvec[MAX_INPUT_FILES_L] array. Updated timings for llil4vec-tbb on my machine can be found here.
// llil3vec-tbb-a.cpp. // Vector version using the Intel TBB library // Note: TBB concurrent vector is best avoided, too much locking overh +ead // g++ compile on Linux: // g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 llil3vec-tbb-a.cpp // g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/local-oneap +i-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi- +tbb-2021.7.0/lib" llil3vec-tbb-a.cpp -l tbb // Seems to run slightly faster when compiled with clang++ instead of +g++ // This g++ command also works with mingw C++ compiler (https://source +forge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.ex +e). // Example run: llil3vec-tbb-a big1.txt big2.txt big3.txt >vec.tmp // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Maximum allowed number of input files #define MAX_INPUT_FILES_L 64 #include <cstdio> #include <cstddef> #include <cstdint> #include <cstdlib> #include <cstring> #include <ctime> #include <string> #include <array> #include <vector> #include <set> #include <utility> #include <iterator> #include <algorithm> #include <execution> #ifdef USE_TBB_L #include <tbb/parallel_sort.h> #include <tbb/parallel_for.h> #endif #include <iostream> #include <fstream> #include <sstream> static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, ne +ed a 64-bit compile"); // ------------------------------------------------------------------- +--------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping + at // program end (see also sleep hack at end of main) // #include <chrono> // #include <thread> // ------------------------------------------------------------------- +--------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in l +ength // To use (limited length) fixed length strings uncomment the next lin +e // big.txt max word length is 6 // long.txt max word length is 208 // The standard big1.txt, big2.txt, big3.txt files all contain 3,515,2 +00 lines #define N_LINES_BIG1_L 3515200 // Based on rough benchmarking, this short fixed string hack // is only worth trying for MAX_STR_LEN_L up to about 64 #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array<char, MAX_STR_LEN_L>; #else using str_type = std::string; #endif using str_int_type = std::pair<str_type, llil_int_type>; using int_str_type = std::pair<llil_int_type, str_type>; using vec_str_int_type = std::vector<str_int_type>; using vec_int_str_type = std::vector<int_str_type>; using set_int_str_type = std::set<int_str_type>; // Mimic the Perl get_properties subroutine -------------------------- +-- // Limit line length and use ANSI C functions to try to boost performa +nce #define MAX_LINE_LEN_L 255 static void get_properties( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno + << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); count = ::atoll( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } // ------------------------------------------------------------------- +-- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil3vec-tbb-a file1 file2 ... >out.txt\n"; return 1; } if (argc > MAX_INPUT_FILES_L) { std::cerr << "sorry too many input files, must be less than " << + MAX_INPUT_FILES_L << "\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil3vec-tbb-a (fixed string length=" << MAX_STR_LEN_ +L << ") start\n"; #else std::cerr << "llil3vec-tbb-a start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #else std::cerr << "don't use TBB\n"; #endif time_t tstart1 = ::time(NULL); clock_t cstart1 = ::clock(); // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; // Create the vector of properties vec_int_str_type propvec; // propvec.reserve(N_LINES_BIG1_L * 3); // doesn't make much dif +ference #ifdef USE_TBB_L // Use an array of local vectors to parallelise vec_int_str_type locvec[MAX_INPUT_FILES_L]; tbb::parallel_for( tbb::blocked_range<int>(0, nfiles), [&](tbb::blocked_range<int> r) { for (int i = r.begin(); i < r.end(); ++i) get_properties( fname[i], locvec[i] ); }); // Append local vectors to propvec // Aside: Try setting locvec[0] to propvec and use/append one less +local vector? for (int i = 0; i < nfiles; ++i) propvec.insert( propvec.end(), locvec[i].begin(), locvec[i].end( +) ); #else // Not using TBB: so just keep on appending to propvec for (int i = 0; i < nfiles; ++i) get_properties( fname[i], propvec ); #endif // Needs to be sorted by word for later sum of adjacent count field +s to work #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return + left.second < right.second; } ); clock_t cend1 = ::clock(); double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SE +C; std::cerr << "get_properties CPU time : " << ctaken1 << " secs +\n"; clock_t cstart2 = ::clock(); // To avoid calling sort(), create an inverted std::set container // Note: negative count gives desired ordering // Aside: try parallel std::reduce/std::accumulate here? (see also +[wp://MapReduce]) set_int_str_type myset; auto it = propvec.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != propvec.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myset.emplace_hint( myset.end(), count, name_last ); name_last = it->second; count = it->first; } } myset.emplace_hint( myset.end(), count, name_last ); clock_t cend2s = ::clock(); // Output the (already sorted) std::set - no sort() function requir +ed // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L // If name is not NULL-terminated: for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN +_L, n.second.data(), -n.first ); // If name is NULL-terminated: // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.d +ata(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second.data() << '\ +t' << -n.first << '\n'; #else // Can try printf vs std::cout to see which is faster for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_st +r(), -n.first ); // for ( auto const& n : myset ) std::cout << n.second << '\t' << - +n.first << '\n'; #endif clock_t cend2 = ::clock(); time_t tend2 = ::time(NULL); long ttaken = static_cast<long>(::difftime(tend2, tstart1) + 0 +.5); double ctaken = (double) (cend2 - cstart1) / (double)CLOCKS_PER_ +SEC; double ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER +_SEC; double ctaken2o = (double) (cend2 - cend2s) / (double)CLOCKS_PER +_SEC; std::cerr << "emplace set sort CPU time : " << ctaken2s << " sec +s\n"; std::cerr << "write stdout CPU time : " << ctaken2o << " sec +s\n"; std::cerr << "total CPU time : " << ctaken << " sec +s\n"; std::cerr << "total wall clock time : " << ttaken << " sec +s\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment nex +t line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000 +)); return 0; }
Updated: Added simpler llil3vec-tbb-a.cpp version.
|
---|
Replies are listed 'Best First'. | |
---|---|
Re^9: Rosetta Code: Long List is Long (faster - vec - OpenMP code - tbb)
by marioroy (Prior) on Jan 17, 2023 at 10:33 UTC | |
Re^9: Rosetta Code: Long List is Long (faster - llil4vec - TBB code)
by marioroy (Prior) on Jan 19, 2023 at 15:42 UTC | |
by eyepopslikeamosquito (Archbishop) on Jan 22, 2023 at 08:05 UTC | |
by marioroy (Prior) on Jan 23, 2023 at 23:22 UTC |