$ time ./llil4map long* long* long* | cksum llil4map start use OpenMP ( old ) ( new ) memory consumption 36.6 GB 17.5 GB use boost sort get properties 12.967s 11.836s map to vector 3.244s 1.292s vector stable sort 10.265s 11.025s write stdout 1.410s 1.364s total time 27.887s 25.519s count lines 970195200 count unique 295755152 29612263 5038456270 real time 0m43.973s 0m27.461s user 25m08.532s 20m02.479s sys 0m55.602s 0m43.250s #### $ time ./llil4vec long* long* long* | cksum llil4vec start use OpenMP ( old ) (mem-fix) ( new ) memory consumption 89.6 GB 62.4 GB 33.0 GB use boost sort get properties 48.133s 32.659s 12.993s sort properties 39.684s 39.399s 33.421s vector reduce 87.982s 75.644s 35.568s vector stable sort 25.990s 31.133s 21.987s write stdout 3.707s 3.206s 3.693s total time 205.498s 182.042s 107.664s count lines 970195200 count unique 295755152 29612263 5038456270 real time 3m57.228s 3m38.118s 1m50.092s user 73m27.931s 70m13.776s 45m32.094s sys 2m42.941s 2m29.584s 2m20.917s #### #pragma omp parallel for schedule(static, 1) reduction(+:num_lines) for (int i = 0; i < nfiles; ++i) { vec_str_int_type locvec; num_lines += get_properties(fname[i], locvec); #pragma omp critical { // Append local vector to propvec (consumes more memory) // propvec.insert( // propvec.end(), // std::make_move_iterator(locvec.begin()), // std::make_move_iterator(locvec.end()) // ); // Append local vector to propvec (faster) auto it = locvec.begin(); auto it2 = locvec.end(); for (; it != it2; ++it) propvec.emplace_back(std::move(*it)); } } #### // Reduce a vector range (tally adjacent count fields of duplicate key names) static void reduce_vec(auto& vec) { if (vec.size() > 0) { auto it1 = vec.begin(); auto itr = it1; auto itw = it1; auto it2 = vec.end(); auto curr = itr; for (++itr; itr != it2; ++itr) { if (itr->first == curr->first) { curr->second += itr->second; } else { if (itw != curr) *itw = std::move(*curr); ++itw; curr = itr; } } if (itw != curr) *itw = std::move(*curr); vec.resize(std::distance(it1, ++itw)); } } #### string_cnt.h - original + key length stored with count string_cnt_f.h - fixed-length-only version string_cnt_u.h - union std::array #### // Free dynamically allocated memory in parallel. #ifndef FIXED_LENGTH_ONLY #pragma omp parallel for schedule(dynamic, 300000) num_threads(nthds) for (size_t i = 0; i < propvec.size(); i++) propvec[i].free(); #endif