cd $HOME/local-boost wget https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0_rc1.tar.gz.json wget https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0_rc1.tar.gz tar -xzf boost_1_81_0_rc1.tar.gz #### clang++ -o llil5vec-tbb -std=c++20 -Wall -O3 -I "$HOME/llil/cmdlinux/fast_io/include" -I "$HOME/local-boost/boost_1_81_0" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil5vec-tbb.cpp -l tbb #### block_indirect_sort.hpp:274:23: warning: implicit capture of ‘this’ via ‘[=]’ is deprecated in C++20 [-Wdeprecated] #### $ NUM_THREADS=6 ./llil4vec-tbb big1.txt big2.txt big3.txt >f.tmp llil4vec-tbb (fixed string length=6) start use TBB get properties time : 0.399307 secs sort properties time : 0.277303 secs emplace set sort time : 0.764016 secs write stdout time : 0.51597 secs total time : 1.95691 secs #### $ NUM_THREADS=6 ./llil5vec-tbb big1.txt big2.txt big3.txt >big-5vec.tmp llil5vec-tbb (fixed string length=6) start use TBB use boost sort get properties time : 0.367148 secs sort properties time : 0.274697 secs vector stable sort time : 0.385357 secs write stdout time : 0.390569 secs total time : 1.4179 secs $ diff big-5vec.tmp big-3vec.tmp #### $ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp llil4vec2 (fixed string length=6) start use OpenMP use boost sort get properties time : 0.353 secs sort properties time : 0.271 secs vector reduce time : 0.074 secs vector stable sort time : 0.197 secs write stdout time : 0.322 secs total time : 1.219 secs #### $ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp llil4vec2 (fixed string length=6) start use OpenMP use boost sort get properties time : 0.321 secs sort properties time : 0.268 secs vector reduce time : 0.078 secs vector stable sort time : 0.194 secs write stdout time : 0.329 secs total time : 1.192 secs #### $ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp llil4vec2 (fixed string length=6) use memcmp start use OpenMP use boost sort get properties time : 0.311 secs sort properties time : 0.261 secs vector reduce time : 0.030 secs vector stable sort time : 0.199 secs write stdout time : 0.316 secs total time : 1.120 secs #### $ LD_PRELOAD=/usr/local/lib/libmimalloc.so NUM_THREADS=6 ./llil4vec-new big1.txt big2.txt big3.txt >f.tmp llil4vec (fixed string length=6) start use OpenMP use boost sort nthrs=6 get properties 0.192 secs sort properties 0.261 secs vector reduce 0.033 secs vector stable sort 0.186 secs write stdout 0.311 secs total time 0.986 secs $ LD_PRELOAD=/usr/local/lib/libmimalloc.so NUM_THREADS=6 ./llil4vec-new big1.txt big2.txt big3.txt >f.tmp llil4vec (fixed string length=6) start use OpenMP use boost sort nthrs=6 get properties 0.201 secs sort properties 0.257 secs vector reduce 0.032 secs vector stable sort 0.194 secs write stdout 0.300 secs total time 0.986 secs $ cmp f.tmp big-3vec.tmp #### $ NUM_THREADS=6 ./llil4map big1.txt big2.txt big3.txt >f.tmp llil4map start use phmap::parallel_flat_hash_map use OpenMP use boost sort get properties time : 1.6778 secs finish merging time : 0.754921 secs vector stable sort time : 0.76791 secs write stdout time : 0.382316 secs total time : 3.5832 secs #### $ NUM_THREADS=6 LD_LIBRARY_PATH=/usr/local/lib ./llil4judy big1.txt big2.txt big3.txt >f.tmp llil4judy (fixed string length=6) start use OpenMP use boost sort get properties 0.682 secs finish merging 2.338 secs vector stable sort 0.262 secs write stdout 0.526 secs total time 3.810 secs $ cmp f.tmp big-3vec.tmp #### $ NUM_THREADS=6 ./llil4vec2 big?.txt big?.txt big?.txt >f4vec.tmp llil4vec2 (fixed string length=6) start use OpenMP use boost sort get properties time : 1.9719 secs sort properties time : 2.27273 secs vector reduce time : 0.201546 secs vector stable sort time : 0.500317 secs write stdout time : 0.721466 secs total time : 5.66814 secs $ NUM_THREADS=6 ./llil4map big?.txt big?.txt big?.txt >f4map.tmp llil4map start use phmap::parallel_flat_hash_map use OpenMP use boost sort get properties time : 5.33992 secs finish merging time : 2.20465 secs vector stable sort time : 1.60465 secs write stdout time : 1.21258 secs total time : 10.3622 secs $ diff f4map.tmp f4vec.tmp $ ls -l f4map.tmp f4vec.tmp -rw-r--r-- 183490874 Feb 1 09:38 f4map.tmp -rw-r--r-- 183490874 Feb 1 09:37 f4vec.tmp $ ls -l big*.txt -rw-r--r-- 31636800 Jan 16 18:26 big1.txt -rw-r--r-- 31636800 Jan 16 18:26 big2.txt -rw-r--r-- 31636800 Jan 16 18:26 big3.txt -rw-r--r-- 31636800 Feb 1 09:33 big4.txt -rw-r--r-- 31636800 Feb 1 09:31 big5.txt -rw-r--r-- 31636800 Feb 1 09:31 big6.txt #### Update: a later version with improved get_properties(): $ NUM_THREADS=6 ./llil4vec2 big?.txt big?.txt big?.txt big?.txt big?.txt big?.txt >f4vec.tmp llil4vec2 (fixed string length=6) start use OpenMP use boost sort get properties time : 4.392 secs sort properties time : 4.706 secs vector reduce time : 0.225 secs vector stable sort time : 0.491 secs write stdout time : 0.766 secs total time : 10.593 secs $ NUM_THREADS=6 ./llil4map big?.txt big?.txt big?.txt big?.txt big?.txt big?.txt >f4map.tmp llil4map start use phmap::parallel_flat_hash_map use OpenMP use boost sort get properties time : 10.6884 secs finish merging time : 2.11571 secs vector stable sort time : 1.83454 secs write stdout time : 2.55906 secs total time : 17.198 secs $ diff f4map.tmp f4vec.tmp #### // llil5vec-tbb.cpp // Based on llil4vec-tbb.cpp in https://perlmonks.com/?node_id=11149687 // // Vector version using the Intel TBB library // Note: TBB concurrent vector is best avoided, too much locking overhead // based on llil3vec-tbb-a.cpp https://perlmonks.com/?node_id=11149622 // 1. Capture time and diff via chrono. // 2. Key words null-terminated for MAX_STR_LEN_L. // 3. Concat strings using fast_io::concatln during output. // 4. Support NUM_THREADS environment variable. // 5. Merge local vector inside the parallel_for block. // This allows threads, completing get_properties, to merge immediately. // 6. Moved vec_loc instantiation inside for loop. // 7. Support running one thread, writing to propvec (no merging). // 8. Capture time for get and sort properties separately. // 9. Added define for using boost's parallel sort; requires clang++. // // To obtain the fast_io library (required dependency): // git clone --depth=1 https://github.com/cppfastio/fast_io // // Compiled on Linux with: // clang++ -o llil5vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-fast_io/fast_io/include" -I "$HOME/local-boost/boost_1_81_0" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil5vec-tbb.cpp -l tbb // Also works with g++ but throws some compiler warnings about deprecated C++20 features. // Seems to run slightly faster when compiled with clang++ rather than g++ // Uncomment to use Intel TBB library #define USE_TBB_L 1 // Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort. // https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/parallel.html // This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux. // Note: I was able to build this program just by downloading and unpacking Boost locally // (no need to build it because the bits we use are header file only) #define USE_BOOST_PARALLEL_SORT 1 #include #include // The fast_io header must come after chrono, else build error: // "no member named 'concatln' in namespace 'fast_io'" #include #include #include #include #include #include #include #include #include #include #include #include #if USE_BOOST_PARALLEL_SORT > 0 #include #else #endif #include #include #ifdef USE_TBB_L #include #include #include #include #endif #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // ---------------------------------------------------------------------------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length // To use (limited length) fixed length strings uncomment the next line // big.txt max word length is 6 // long.txt max word length is 208 // The standard big1.txt, big2.txt, big3.txt files all contain 3,515,200 lines #define N_LINES_BIG1_L 3515200 // Based on rough benchmarking, the short fixed string hack below is only // worth trying for MAX_STR_LEN_L up to about 22. // See also https://backlinko.com/google-keyword-study #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L using str_type = std::array; #else using str_type = std::string; #endif using str_int_type = std::pair; using int_str_type = std::pair; using vec_str_int_type = std::vector; using vec_int_str_type = std::vector; // fast_atoll64 -------------------------------------------------------- // // https://stackoverflow.com/questions/16826422/ // c-most-efficient-way-to-convert-string-to-int-faster-than-atoi inline int64_t fast_atoll64( const char* str ) { int64_t val = 0; int sign = 0; if ( *str == '-' ) { sign = 1, ++str; } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; return sign ? -val : val; } // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { found = ::strchr(line, '\t'); // fast_atoll64 is a touch faster than ::atoll count = fast_atoll64( &line[found - line + 1] ); line[found - line] = '\0'; // word #ifdef MAX_STR_LEN_L str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0', '\0' } }; ::memcpy( fixword.data(), line, found - line ); vec_ret.emplace_back( -count, fixword ); #else vec_ret.emplace_back( -count, line ); #endif } ::fclose(fh); } double elaspe_time( std::chrono::high_resolution_clock::time_point cend, std::chrono::high_resolution_clock::time_point cstart) { return double( std::chrono::duration_cast(cend - cstart).count() ) * 1e-6; } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil5vec-tbb file1 file2 ... >out.txt\n"; return 1; } #ifdef MAX_STR_LEN_L std::cerr << "llil5vec-tbb (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #else std::cerr << "llil5vec-tbb start\n"; #endif #ifdef USE_TBB_L std::cerr << "use TBB\n"; #else std::cerr << "don't use TBB\n"; #endif #if USE_BOOST_PARALLEL_SORT == 0 std::cerr << "don't use boost sort\n"; #else std::cerr << "use boost sort\n"; #endif std::chrono::high_resolution_clock::time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3; cstart1 = std::chrono::high_resolution_clock::now(); #ifdef USE_TBB_L // Determine the number of threads. const char* env_nthrs = std::getenv("NUM_THREADS"); int nthrs = (env_nthrs && strlen(env_nthrs)) ? ::atoi(env_nthrs) : std::thread::hardware_concurrency(); tbb::global_control global_limit(tbb::global_control::max_allowed_parallelism, nthrs); #else int nthrs = 1; #endif // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; // Create the vector of properties vec_int_str_type propvec; // propvec.reserve(N_LINES_BIG1_L * 3); // doesn't make much difference // Run parallel, depending on the number of threads if ( nthrs == 1 ) { for (int i = 0; i < nfiles; ++i) get_properties( fname[i], propvec ); } #ifdef USE_TBB_L else { tbb::parallel_for( tbb::blocked_range(0, nfiles, 1), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) { vec_int_str_type locvec; get_properties( fname[i], locvec ); // A static mutex that is shared across all threads static tbb::spin_mutex mtx; // Acquire a scoped lock tbb::spin_mutex::scoped_lock lock(mtx); // Append local vector to propvec propvec.insert( propvec.end(), locvec.begin(), locvec.end() ); } }); } #endif cend1 = std::chrono::high_resolution_clock::now(); double ctaken1 = elaspe_time(cend1, cstart1); std::cerr << "get properties time : " << ctaken1 << " secs\n"; cstart2 = std::chrono::high_resolution_clock::now(); // Needs to be sorted by word for later sum of adjacent count fields to work #if USE_BOOST_PARALLEL_SORT == 0 #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; } ); #else // Try building with clang++ if g++ emits errors boost::sort::block_indirect_sort( propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }, nthrs ); #endif cend2 = std::chrono::high_resolution_clock::now(); double ctaken2 = elaspe_time(cend2, cstart2); std::cerr << "sort properties time : " << ctaken2 << " secs\n"; cstart3 = std::chrono::high_resolution_clock::now(); // To sort, replace building a std::set with sorting a std::vector // Note: negative count gives desired ordering // Aside: consider how this loop might be parallelised vec_int_str_type myvec; auto it = propvec.cbegin(); str_type name_last = it->second; llil_int_type count = it->first; for (++it; it != propvec.cend(); ++it) { if ( it->second == name_last ) { count += it->first; } else { myvec.emplace_back( count, name_last ); name_last = it->second; count = it->first; } } myvec.emplace_back( count, name_last ); // As usual, stable_sort with a simpler sort function tends to be slower #if USE_BOOST_PARALLEL_SORT == 0 #ifdef USE_TBB_L tbb::parallel_sort( #else std::sort( #endif myvec.begin(), myvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; } // use this one for std::stable_sort // [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; } ); #else // block_indirect_sort seems to be the fastest // Note: spinsort takes only 3 parameters (no nthrs parameter, unlike the other two) boost::sort::block_indirect_sort( // boost::sort::parallel_stable_sort( // boost::sort::spinsort( myvec.begin(), myvec.end(), [](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; } // [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; } , nthrs ); #endif cend3s = std::chrono::high_resolution_clock::now(); // Note: fix up negative count via -n.first #ifdef MAX_STR_LEN_L for ( auto const& n : myvec ) ::print(fast_io::concatln(std::string(n.second.data()), "\t", -n.first)); #else for ( auto const& n : myvec ) ::print(fast_io::concatln(n.second, "\t", -n.first)); #endif cend3 = std::chrono::high_resolution_clock::now(); double ctaken = elaspe_time(cend3, cstart1); double ctaken3s = elaspe_time(cend3s, cstart3); double ctaken3o = elaspe_time(cend3, cend3s); std::cerr << "vector stable sort time : " << ctaken3s << " secs\n"; std::cerr << "write stdout time : " << ctaken3o << " secs\n"; std::cerr << "total time : " << ctaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; } #### // llil4vec2.cpp // See also: perlmonks.com, node_id=11149754 // llil4vec.cpp with a reduce_vec function. // Based on: https://perlmonks.com/?node_id=11149545 // OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book/content/ // // Vector version of llil2grt.pl. // based on llil3vec.cpp https://perlmonks.com/?node_id=11149482 // 1. Run get_properties in parallel. // 2. Capture time and diff via chrono. // 3. Threads: flush local vector periodically. // 4. Key words null-terminated for MAX_STR_LEN_L. // 5. Concat strings using fast_io::concatln during output. // 6. Support NUM_THREADS environment variable. // 7. Add FLUSH_VECTOR_PERIODICALLY define statement. // 8. Removed periodically flush, not what I expected. // 9. Simplified code, similar to the tbb implementation. // A. Capture time for get and sort properties separately. // B. Added define for using boost's parallel sort. // C. Replaced atoll with fast_atoll64. // D. Fast vector sorting - from llil5vec-tbb. // E. Reduce in-place, duplicate key names - tally count. // F. Exit early if no work; fast_io tweak writing to output; // fixword: ensure not more than MAX_LINE_LEN_L characters; // limit to 8 threads max for sorting. // G. Capture time for vector reduce separately. // I. Improved get_properties; set precision for timings. // // Obtain the fast_io library (required dependency): // git clone --depth=1 https://github.com/cppfastio/fast_io // // g++ compile on Linux: (boost header may need the -Wno-stringop-overflow gcc option) // g++ -o llil4vec2 -std=c++20 -Wall -O3 llil4vec2.cpp -I ./fast_io/include // g++ -o llil4vec2-omp -std=c++20 -fopenmp -Wall -O3 llil4vec2.cpp -I ./fast_io/include // // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // // clang++ compile: same args, without the -Wno-stringop-overflow option // Seems to run slightly faster when compiled with clang++ instead of g++ // // An example compile on Ubuntu Linux with some 3rd party (header) libraries unpacked to $HOME: // clang++ -o llil4vec2 -std=c++20 -fopenmp -Wall -O3 // -I "$HOME/local-fast_io/fast_io/include" // -I "$HOME/local-parallel-hashmap/parallel-hashmap" // -I "$HOME/local-boost/boost_1_81_0" // llil4vec2.cpp // // Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/?node_id=11148681 // perl gen-llil.pl big1.txt 200 3 1 // perl gen-llil.pl big2.txt 200 3 1 // perl gen-llil.pl big3.txt 200 3 1 // perl gen-long-llil.pl long1.txt 600 // perl gen-long-llil.pl long2.txt 600 // perl gen-long-llil.pl long3.txt 600 // // To make random input, obtain shuffle.pl from https://perlmonks.com/?node_id=11149800 // perl shuffle.pl big1.txt >tmp && mv tmp big1.txt // perl shuffle.pl big2.txt >tmp && mv tmp big2.txt // perl shuffle.pl big3.txt >tmp && mv tmp big3.txt // // Example run: llil4vec2 big1.txt big2.txt big3.txt >out.txt // NUM_THREADS=3 llil4vec2-omp ... // ---------------------------------------------------------------------------- // Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort. // https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/parallel.html // This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux. // Note: Another option is downloading and unpacking Boost locally. // (no need to build it because the bits we use are header file only) #define USE_BOOST_PARALLEL_SORT 1 #include #include // The fast_io header must come after chrono, else build error: // "no member named 'concatln' in namespace 'fast_io'" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if USE_BOOST_PARALLEL_SORT > 0 #include #endif #ifdef _OPENMP #include #endif #include #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- typedef long long llil_int_type; // Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length // big.txt max word length is 6 // long.txt max word length is 208 // Based on rough benchmarking, the short fixed string hack below is only // worth trying for MAX_STR_LEN_L up to about 22. // See also https://backlinko.com/google-keyword-study // Note: if input data contains words longer than MAX_STR_LEN_L // the program may malfunction or even crash // To use (limited length) fixed length strings uncomment the next line #define MAX_STR_LEN_L 6 #ifdef MAX_STR_LEN_L // Uncomment next line to use C memcmp function to compare fixed length strings #define USE_MEMCMP_L 1 // using str_type = std::array; using str_type = std::array; #else // using str_type = std::string; using str_type = std::basic_string; #endif using int_str_type = std::pair; using vec_int_str_type = std::vector; // fast_atoll64 // https://stackoverflow.com/questions/16826422/ // c-most-efficient-way-to-convert-string-to-int-faster-than-atoi // Commenting out done below because llil spec [id://11148465] states: // each line must match : ^[a-z]+\t\d+$ // i.e. you may assume count >=0 inline int64_t fast_atoll64( const char* str ) { int64_t val = 0; // int sign = 0; // if ( *str == '-' ) { // sign = 1, ++str; // } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; // return sign ? -val : val; return val; } // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use ANSI C functions to try to boost performance #define MAX_LINE_LEN_L 255 static void get_properties( const char* fname, // in: the input file name vec_int_str_type& vec_ret) // out: a vector of properties { FILE* fh; std::array line; char* found; llil_int_type count; fh = ::fopen(fname, "r"); if ( fh == NULL ) { std::cerr << "Error opening '" << fname << "' : " << strerror(errno) << "\n"; return; } while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), fh ) != NULL ) { found = std::find( line.begin(), line.end(), '\t' ); count = fast_atoll64(found+1); #ifdef MAX_STR_LEN_L str_type fixword {}; // Note: {} initializes all elements of fixword to '\0' std::copy( line.begin(), found, fixword.begin() ); vec_ret.emplace_back( count, fixword ); #else *found = '\0'; vec_ret.emplace_back( count, line.data() ); #endif } ::fclose(fh); } // Reduce a vector range (tally adjacent count fields of duplicate key names) // Return the reduced length static vec_int_str_type::size_type reduce_vec( vec_int_str_type::iterator it1, // range of vector elements to reduce vec_int_str_type::iterator it2 ) { auto itr = it1; auto itw = it1; llil_int_type count = itr->first; str_type name_last = itr->second; for ( ++itr; itr != it2; ++itr ) { #ifdef USE_MEMCMP_L if ( ::memcmp(itr->second.data(), name_last.data(), MAX_STR_LEN_L) == 0 ) { #else if ( itr->second == name_last ) { #endif count += itr->first; } else { itw->first = count; itw->second = name_last; ++itw; count = itr->first; name_last = itr->second; } } itw->first = count; itw->second = name_last; return std::distance(it1, ++itw); } typedef std::chrono::high_resolution_clock high_resolution_clock; typedef std::chrono::high_resolution_clock::time_point time_point; typedef std::chrono::milliseconds milliseconds; double elaspe_time( time_point cend, time_point cstart) { return double ( std::chrono::duration_cast(cend - cstart).count() ) * 1e-3; } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil4vec2 file1 file2 ... >out.txt\n"; return 1; } std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fixed); #ifdef MAX_STR_LEN_L #ifdef USE_MEMCMP_L std::cerr << "llil4vec2 (fixed string length=" << MAX_STR_LEN_L << ") use memcmp start\n"; #else std::cerr << "llil4vec2 (fixed string length=" << MAX_STR_LEN_L << ") start\n"; #endif #else std::cerr << "llil4vec2 start\n"; #endif #ifdef _OPENMP std::cerr << "use OpenMP\n"; #else std::cerr << "don't use OpenMP\n"; #endif #if USE_BOOST_PARALLEL_SORT == 0 std::cerr << "don't use boost sort\n"; #else std::cerr << "use boost sort\n"; #endif time_point cstart1, cend1, cstart2, cend2, cstart3, cend3r, cend3s, cend3; cstart1 = std::chrono::high_resolution_clock::now(); #ifdef _OPENMP // Determine the number of threads. const char* env_nthrs = std::getenv("NUM_THREADS"); int nthrs = (env_nthrs && strlen(env_nthrs)) ? ::atoi(env_nthrs) : std::thread::hardware_concurrency(); omp_set_dynamic(false); omp_set_num_threads(nthrs); #else int nthrs = 1; #endif int nthrs_sort = ( std::thread::hardware_concurrency() < 12 ) ? std::thread::hardware_concurrency() : 12; // Get the list of input files from the command line int nfiles = argc - 1; char** fname = &argv[1]; // Create the vector of properties vec_int_str_type propvec; // Run parallel, depending on the number of threads if ( nthrs == 1 || nfiles == 1 ) { for (int i = 0; i < nfiles; ++i) get_properties( fname[i], propvec ); } #ifdef _OPENMP else { #pragma omp parallel for schedule(static, 1) for (int i = 0; i < nfiles; ++i) { vec_int_str_type locvec; get_properties( fname[i], locvec ); #pragma omp critical { // Append local vector to propvec propvec.insert( propvec.end(), locvec.begin(), locvec.end() ); } } } #endif if (!propvec.size()) { std::cerr << "No work, exiting...\n"; return 1; } cend1 = std::chrono::high_resolution_clock::now(); double ctaken1 = elaspe_time(cend1, cstart1); std::cerr << "get properties time : " << std::setw(8) << ctaken1 << " secs\n"; cstart2 = std::chrono::high_resolution_clock::now(); // Needs to be sorted by word for later sum of adjacent count fields to work #if USE_BOOST_PARALLEL_SORT == 0 std::sort( propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return #ifdef USE_MEMCMP_L ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0; #else left.second < right.second; #endif } ); #else boost::sort::block_indirect_sort( propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { return #ifdef USE_MEMCMP_L ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0; #else left.second < right.second; #endif }, nthrs_sort ); #endif cend2 = std::chrono::high_resolution_clock::now(); double ctaken2 = elaspe_time(cend2, cstart2); std::cerr << "sort properties time : " << std::setw(8) << ctaken2 << " secs\n"; cstart3 = std::chrono::high_resolution_clock::now(); // Reduce in-place (tally adjacent count fields of duplicate key names) vec_int_str_type::size_type newsize = reduce_vec( propvec.begin(), propvec.end() ); propvec.resize(newsize); cend3r = std::chrono::high_resolution_clock::now(); // Sort the vector by (count) in reverse order, (name) in lexical order #if USE_BOOST_PARALLEL_SORT == 0 std::sort( // Standard sort propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { #ifdef USE_MEMCMP_L return left.first != right.first ? left.first > right.first : ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0; #else return left.first != right.first ? left.first > right.first : left.second < right.second; #endif } ); #else boost::sort::block_indirect_sort( // Parallel sort propvec.begin(), propvec.end(), [](const int_str_type& left, const int_str_type& right) { #ifdef USE_MEMCMP_L return left.first != right.first ? left.first > right.first : ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0; #else return left.first != right.first ? left.first > right.first : left.second < right.second; #endif }, nthrs_sort ); #endif cend3s = std::chrono::high_resolution_clock::now(); // Output the sorted vector for ( auto const& n : propvec ) { #ifdef MAX_STR_LEN_L // Note: finding doco on mnp::os_c_str() is a challenge, but testing shows this // prints non null-terminated strings of length MAX_STR_LEN_L correctly // ::print(fast_io::concatln(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first)); // ::println(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first); fast_io::io::println(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first); #else // ::print(fast_io::concatln(n.second, "\t", n.first)); // ::println(n.second, "\t", n.first); fast_io::io::println(n.second, "\t", n.first); #endif } cend3 = std::chrono::high_resolution_clock::now(); double ctaken = elaspe_time(cend3, cstart1); double ctaken3r = elaspe_time(cend3r, cstart3); double ctaken3s = elaspe_time(cend3s, cend3r); double ctaken3o = elaspe_time(cend3, cend3s); std::cerr << "vector reduce time : " << std::setw(8) << ctaken3r << " secs\n"; std::cerr << "vector stable sort time : " << std::setw(8) << ctaken3s << " secs\n"; std::cerr << "write stdout time : " << std::setw(8) << ctaken3o << " secs\n"; std::cerr << "total time : " << std::setw(8) << ctaken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; }