$ NUM_THREADS=3 ./llil4vec big1.txt big2.txt big3.txt >out.txt llil4vec (fixed string length=12) start use OpenMP use boost sort get properties 0.131 secs sort properties 0.191 secs vector reduce 0.036 secs vector stable sort 0.267 secs write stdout 0.191 secs total time 0.818 secs ##
##

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// llil4vec.cc
// A std::vector demonstration using OpenMP directives.
// https://perlmonks.com/?node_id=11149545
//
// April 25, 2024
//    Based on llil3m.cpp  https://perlmonks.com/?node_id=11149482
//    Original challenge   https://perlmonks.com/?node_id=11147822
//         and summary     https://perlmonks.com/?node_id=11150293
//    Other demonstrations https://perlmonks.com/?node_id=11149907
//
// Authors
//    Mario Roy - C++ demonstration with parallel capabilities
//    eyepopslikeamosquito - Co-author, learning C++ at PerlMonks.com
//
// See also, memory efficient variant
//    https://gist.github.com/marioroy/d02881b96b20fa1adde4388b3e216163
//
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book
//
// Compile on Linux (clang++ or g++):
//    clang++ -o llil4vec -std=c++20 -fopenmp -Wall -O3 llil4vec.cc
//
// On macOS, use g++-12 from https://brew.sh (installation: brew install gcc@12).
// The g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
//
// Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/?node_id=11148681
//    perl gen-llil.pl big1.txt 200 3 1
//    perl gen-llil.pl big2.txt 200 3 1
//    perl gen-llil.pl big3.txt 200 3 1
//
// To make random input, obtain shuffle.pl from https://perlmonks.com/?node_id=11149800
//    perl shuffle.pl big1.txt >tmp && mv tmp big1.txt
//    perl shuffle.pl big2.txt >tmp && mv tmp big2.txt
//    perl shuffle.pl big3.txt >tmp && mv tmp big3.txt
//
// Example run:  llil4vec big1.txt big2.txt big3.txt >out.txt
// NUM_THREADS=3 llil4vec ...
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 

#include 
#include 

#include 
#include 

#include 
#include 
#include 

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_85_0/libs/sort/doc/html/sort/parallel.html
// https://www.boost.org/doc/libs/1_85_0/libs/sort/doc/papers/block_indirect_sort_en.pdf
// This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux.
// Note: Another option is downloading and unpacking Boost locally.
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1

#if USE_BOOST_PARALLEL_SORT
   #ifdef __clang__
      #pragma clang diagnostic push
      #pragma clang diagnostic ignored "-Wunused-parameter"
      #pragma clang diagnostic ignored "-Wshadow"
      #include 
      #pragma clang diagnostic pop
   #else
      #include 
   #endif
#endif

#ifdef _OPENMP
   #include 
#endif

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

typedef uint32_t int_type;

// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt  max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is only
// worth trying for MAX_STR_LEN_L up to about 30.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next line.
#define MAX_STR_LEN_L (size_t) 12

#ifdef MAX_STR_LEN_L
struct str_type : std::array {
   bool operator==( const str_type& o ) const {
      return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) == 0;
   }
   bool operator<( const str_type& o ) const {
      return ::memcmp(this->data(), o.data(), MAX_STR_LEN_L) < 0;
   }
};
#else
using str_type = std::basic_string;
#endif

using str_int_type     = std::pair;
using vec_str_int_type = std::vector;

// Mimic the Perl get_properties subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

// convert positive number from string to uint32_t
inline uint32_t fast_atoll64(const char* str)
{
   uint32_t val = 0;
   uint8_t digit;
   while ((digit = uint8_t(*str++ - '0')) <= 9)
      val = val * 10 + digit;
   return val;
}

// Helper function to find a character.
inline char* find_char(char* first, char* last, char c)
{
   while (first != last) {
      if (*first == c) break;
      ++first;
   }
   return first;
}

static int64_t get_properties(
   const char*        fname,       //  in: the input file name
   vec_str_int_type&  vec_ret)     // out: a vector of properties
{
   int64_t num_lines = 0;
   std::ifstream fin(fname, std::ios::binary);
   if (!fin.is_open()) {
      std::cerr << "Error opening '" << fname << "' : " << strerror(errno) << '\n';
      return num_lines;
   }
   fin.seekg(0, std::ios::end);         // Get the size of the file
   std::streampos fsize = fin.tellg();
   fin.seekg(0, std::ios::beg);

   std::vector buf(1 + fsize);    // Read the whole file into the buffer
   fin.read(&buf[0], fsize);
   fin.close();

   char* first = &buf[0];
   char* last = &buf[fsize];
                                        // Iterate through the buffer
   while (first < last) {
      char* beg_ptr{first};
      char* end_ptr{find_char(first, last, '\n')};
      char* found = find_char(beg_ptr, end_ptr, '\t');

      first = end_ptr + 1;
      if (found == end_ptr)
         continue;

      assert(*found == '\t');
      int_type count = fast_atoll64(found + 1);
      size_t klen = found - beg_ptr;

#ifdef MAX_STR_LEN_L
      str_type s {};  // {} initializes all elements of s to '\0'
      ::memcpy(s.data(), beg_ptr, std::min(MAX_STR_LEN_L, klen));
#else
      str_type s(beg_ptr, klen);
#endif
      vec_ret.emplace_back(std::move(s), count);

      ++num_lines;
   }

   return num_lines;
}

// Output subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

inline constexpr size_t CHUNK_SIZE = 32768;

size_t divide_up(size_t dividend, size_t divisor)
{
   if (dividend % divisor)
      return (size_t)(dividend / divisor) + 1;
   else
      return (size_t)(dividend / divisor);
}           

static void out_properties(
   const int          nthds,     // in   : the number of threads
   vec_str_int_type&  vec)       // in   : the vector to output
{
   size_t num_chunks = divide_up(vec.size(), CHUNK_SIZE);
   int nthds_out = 1;
#ifdef _OPENMP
   nthds_out = std::min(nthds, 32);
#endif

   #pragma omp parallel for ordered schedule(static, 1) num_threads(nthds_out)
   for (size_t chunk_id = 1; chunk_id <= num_chunks; ++chunk_id) {
      std::string str(""); str.reserve(2048 * 1024);
      auto it  = vec.begin() + (chunk_id - 1) * CHUNK_SIZE;
      auto it2 = vec.begin() + std::min(vec.size(), chunk_id * CHUNK_SIZE);

      for (; it != it2; ++it) {
#ifdef MAX_STR_LEN_L
         str.append(it->first.data());
#else
         str.append(it->first.data(), it->first.size());
#endif
         str.append("\t", 1);
         str.append(std::to_string(it->second));
         str.append("\n", 1);
      }

      #pragma omp ordered
      std::cout << str << std::flush;
   }
}

// Reduce a vector range (tally adjacent count fields of duplicate key names)
static void reduce_vec(
   auto& vec    // vector elements to reduce
)
{
   if (vec.size() > 0) {
      auto it1 = vec.begin(); auto itr = it1; auto itw = it1;
      auto it2 = vec.end();
      auto curr = itr;

      for (++itr; itr != it2; ++itr) {
         if (itr->first == curr->first) {
            curr->second += itr->second;
         }
         else {
            if (itw != curr)
               *itw = std::move(*curr);
            ++itw;
            curr = itr;
         }
      }

      if (itw != curr)
         *itw = std::move(*curr);

      vec.resize(std::distance(it1, ++itw));
   }
}

typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;

double elaspe_time(time_point cend, time_point cstart) {
   return double (
      std::chrono::duration_cast(cend - cstart).count()
   ) * 1e-3;
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

int main(int argc, char* argv[])
{
   if (argc < 2) {
      if (argc > 0)
         std::cerr << "usage: llil4vec file1 file2 ... >out.txt\n";
      return 1;
   }

   std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fixed);
#ifdef MAX_STR_LEN_L
   std::cerr << "llil4vec (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
   std::cerr << "llil4vec start\n";
#endif
#ifdef _OPENMP
   std::cerr << "use OpenMP\n";
#else
   std::cerr << "don't use OpenMP\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
   std::cerr << "don't use boost sort\n";
#else
   std::cerr << "use boost sort\n";
#endif
   time_point cstart1, cend1, cstart2, cend2, cstart3, cend3r, cend3s, cend3;
   cstart1 = high_resolution_clock::now();

#ifdef _OPENMP
   // Determine the number of threads.
   const char* env_nthds = std::getenv("NUM_THREADS");
   int nthds = ( env_nthds && strlen(env_nthds) )
      ? ::atoi(env_nthds)
      : std::thread::hardware_concurrency();
   omp_set_dynamic(false);
   omp_set_num_threads(nthds);
#else
   int nthds = 1;
#endif

   // Get the list of input files from the command line
   int    nfiles = argc - 1;
   char** fname  = &argv[1];

   // Create the vector of properties
   vec_str_int_type propvec;
   int64_t num_lines = 0;

   // Run parallel, depending on the number of threads
   if (nthds == 1 || nfiles == 1) {
      for (int i = 0; i < nfiles; ++i)
         num_lines += get_properties(fname[i], propvec);
   }
#ifdef _OPENMP
   else {
      #pragma omp parallel for schedule(static, 1) reduction(+:num_lines)
      for (int i = 0; i < nfiles; ++i) {
         vec_str_int_type locvec;
         num_lines += get_properties(fname[i], locvec);
         #pragma omp critical
         {
            // Append local vector to propvec (consumes more memory)
            // propvec.insert(
            //    propvec.end(),
            //    std::make_move_iterator(locvec.begin()),
            //    std::make_move_iterator(locvec.end())
            // );

            // Append local vector to propvec (faster)
            auto it  = locvec.begin();
            auto it2 = locvec.end();
            for (; it != it2; ++it)
               propvec.emplace_back(std::move(*it));
         }
      }
   }
#endif

   cend1 = high_resolution_clock::now();
   double ctaken1 = elaspe_time(cend1, cstart1);
   std::cerr << "get properties      " << std::setw(8) << ctaken1 << " secs\n";

   if (!propvec.size()) {
      std::cerr << "No work, exiting...\n";
      return 1;
   }

   cstart2 = high_resolution_clock::now();

   // Needs to be sorted by word for later sum of adjacent count fields to work
#if USE_BOOST_PARALLEL_SORT == 0
   std::sort(
      propvec.begin(), propvec.end(),
      [](const str_int_type& left, const str_int_type& right) {
         return left.first < right.first;
      }
   );
#else
   boost::sort::block_indirect_sort(
      propvec.begin(), propvec.end(),
      [](const str_int_type& left, const str_int_type& right) {
         return left.first < right.first;
      },
      std::min(nthds, 32)
   );
#endif

   cend2 = high_resolution_clock::now();
   double ctaken2 = elaspe_time(cend2, cstart2);
   std::cerr << "sort properties     " << std::setw(8) << ctaken2 << " secs\n";

   cstart3 = high_resolution_clock::now();

   // Reduce in-place (tally adjacent count fields of duplicate key names)
   reduce_vec(propvec);

   cend3r = high_resolution_clock::now();

   // Sort the vector by (count) in reverse order, (name) in lexical order
   auto reverse_order = [](const str_int_type& left, const str_int_type& right) {
      return left.second != right.second
         ? left.second > right.second
         : left.first  < right.first;
   };
#if USE_BOOST_PARALLEL_SORT == 0
   // Standard sort
   std::sort(propvec.begin(), propvec.end(), reverse_order);
#else
   // Parallel sort
   boost::sort::block_indirect_sort(
      propvec.begin(), propvec.end(), reverse_order,
      std::min(nthds, 32)
   );
#endif

   cend3s = high_resolution_clock::now();

   // Output the sorted vector
   out_properties(nthds, propvec);
   cend3 = high_resolution_clock::now();

   double ctaken   = elaspe_time(cend3, cstart1);
   double ctaken3r = elaspe_time(cend3r, cstart3);
   double ctaken3s = elaspe_time(cend3s, cend3r);
   double ctaken3o = elaspe_time(cend3, cend3s);

   std::cerr << "vector reduce       " << std::setw(8) << ctaken3r << " secs\n";
   std::cerr << "vector stable sort  " << std::setw(8) << ctaken3s << " secs\n";
   std::cerr << "write stdout        " << std::setw(8) << ctaken3o << " secs\n";
   std::cerr << "total time          " << std::setw(8) << ctaken   << " secs\n";
   std::cerr << "    count lines     " << num_lines << "\n";
   std::cerr << "    count unique    " << propvec.size() << "\n";

   // Hack to see Private Bytes in Windows Task Manager
   // (uncomment next line so process doesn't exit too quickly)
   // std::this_thread::sleep_for(milliseconds(9000));

   return 0;
}