#!/usr/bin/env perl use strict; use warnings; use List::Util 'shuffle'; my @arr = shuffle <>; print @arr; ##

##

./shuffle.pl big1.txt >tmp && mv tmp big1.txt
./shuffle.pl big2.txt >tmp && mv tmp big2.txt
./shuffle.pl big3.txt >tmp && mv tmp big3.txt

./shuffle.pl long1.txt >tmp && mv tmp long1.txt
./shuffle.pl long2.txt >tmp && mv tmp long2.txt
./shuffle.pl long3.txt >tmp && mv tmp long3.txt

##

##

$ NUM_THREADS=1 ./llil4judy-no6 big{1,2,3}.txt long{1,2,3}.txt >out.txt
llil4judy start
use OpenMP
use boost sort
get properties         2.336 secs
judy to vector         0.252 secs
vector stable sort     0.566 secs
write stdout           0.220 secs
total time             3.375 secs

$ NUM_THREADS=1 ./llil4map-no6-flat big{1,2,3}.txt long{1,2,3}.txt >out.txt
llil4map start
use OpenMP
use boost sort
get properties         1.847 secs
map to vector          0.198 secs
vector stable sort     0.513 secs
write stdout           0.224 secs
total time             2.784 secs


$ NUM_THREADS=1 ./llil4vec-no6 big{1,2,3}.txt long{1,2,3}.txt >out.txt
llil4vec start
use OpenMP
use boost sort
get properties         0.510 secs
sort properties        1.046 secs
vector reduce          0.266 secs
vector stable sort     1.125 secs
write stdout           0.283 secs
total time             3.233 secs

##

##

~/j904/bin/jconsole llil5.ijs big{1,2,3}.txt long{1,2,3}.txt out.txt

Read and parse input:    2.44621
Classify, sum, sort:     4.82435
Format and write output: 2.05906
Total time:              9.32962

##

##

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// llil4judy.cc
// A Judy SL demonstration.
// https://www.perlmonks.org/?node_id=11149800
//
// April 25, 2024
//    Based on llil3m.cpp  https://perlmonks.com/?node_id=11149482
//    Original challenge   https://perlmonks.com/?node_id=11147822
//         and summary     https://perlmonks.com/?node_id=11150293
//    Other demonstrations https://perlmonks.com/?node_id=11149907
//
// Authors
//    Mario Roy - C++ demonstration with parallel capabilities
//    eyepopslikeamosquito - Co-author, learning C++ at PerlMonks.com
//
// See also, memory efficient variant
//    https://gist.github.com/marioroy/d02881b96b20fa1adde4388b3e216163
//
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book
//          
// This requires the boost library: boost::static_strings::basic_static_string.
//
// Obtain, build, and install the Judy array library.
//    I downloaded two patch files from the gentoo site.
//       https://judy.sourceforge.net/index.html
//       https://gitweb.gentoo.org/repo/gentoo.git/tree/dev-libs/judy
//       https://gitweb.gentoo.org/repo/gentoo.git/tree/dev-libs/judy/files
//
//    tar xf ~/Downloads/Judy-1.0.5.tar.gz
//    cd judy-1.0.5
//
//    patch -p0 < ~/Downloads/judy-1.0.5-parallel-make.patch
//    patch -p1 < ~/Downloads/judy-1.0.5-gcc49.patch
//
//    sed -i 's/automake-1.9/automake/' bootstrap
//    sed -i 's/AM_CONFIG_HEADER/AC_CONFIG_HEADERS/g' configure.ac
//
//    ./configure --enable-64-bit
//
//    # Omit passing the -j option to make. It may fail due to parallel build.
//    make
//    sudo make install
//    sudo rm /usr/local/lib/libJudy.la
//    sudo ldconfig
//
// Compile on Linux (clang++ or g++):
//    clang++ -o llil4judy -std=c++20 -fopenmp -Wall -O3 llil4judy.cc -I/usr/local/include -L/usr/local/lib -lJudy
//
// The g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
//
// Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/?node_id=11148681
//    perl gen-llil.pl big1.txt 200 3 1
//    perl gen-llil.pl big2.txt 200 3 1
//    perl gen-llil.pl big3.txt 200 3 1
//
// To make random input, obtain shuffle.pl from https://perlmonks.com/?node_id=11149800
//    perl shuffle.pl big1.txt >tmp && mv tmp big1.txt
//    perl shuffle.pl big2.txt >tmp && mv tmp big2.txt
//    perl shuffle.pl big3.txt >tmp && mv tmp big3.txt
//
// Example run:  llil4judy big1.txt big2.txt big3.txt >out.txt
// NUM_THREADS=3 llil4judy ...
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 

#include 
#include 

#include 
#include 
#include 

#include 
#include 
#include 

#include 

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_85_0/libs/sort/doc/html/sort/parallel.html
// https://www.boost.org/doc/libs/1_85_0/libs/sort/doc/papers/block_indirect_sort_en.pdf
// This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux.
// Note: Another option is downloading and unpacking Boost locally.
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1

#if USE_BOOST_PARALLEL_SORT
   #ifdef __clang__
      #pragma clang diagnostic push
      #pragma clang diagnostic ignored "-Wunused-parameter"
      #pragma clang diagnostic ignored "-Wshadow"
      #include 
      #pragma clang diagnostic pop
   #else
      #include 
   #endif
#endif

#ifdef _OPENMP
   #include 
#endif

class spinlock_mutex {
// https://rigtorp.se/spinlock/
// https://vorbrodt.blog/2019/02/12/fast-mutex/
public:
   // Assignment is disabled.
   spinlock_mutex& operator=(const spinlock_mutex& rhs) = delete;

   void lock() noexcept {
      for (;;) {
         if (!lock_.exchange(true, std::memory_order_acquire))
            break;
         while (lock_.load(std::memory_order_relaxed))
            __builtin_ia32_pause();
      }
   }

   void unlock() noexcept {
      lock_.store(false, std::memory_order_release);
   }

private:
   alignas(4 * sizeof(std::max_align_t)) std::atomic_bool lock_ = false;
};

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

// Judy wrappers
//
// The Judy C macros are not compatible with C++, so created wrappers.
// One of the difficulties in using the Judy function calls lies in
// determining whether to pass a pointer or the address of a pointer.
// I looked up the functions in /usr/local/include/Judy.h.
//
inline PWord_t judysl_ins( Pvoid_t &(array), const char* index ) {
   return (PWord_t) ::JudySLIns(&(array), (const uint8_t *) index, NULL);
}

inline Word_t judysl_freearray( Pvoid_t &(array) ) {
   return (Word_t) ::JudySLFreeArray(&(array), NULL);
}

inline PWord_t judysl_get( Pcvoid_t array, const char* index ) {
   return (PWord_t) ::JudySLGet(array, (const uint8_t *) index, NULL);
}

inline PWord_t judysl_first( Pcvoid_t array, char* index ) {
   return (PWord_t) ::JudySLFirst(array, (uint8_t *) index, NULL);
}

inline PWord_t judysl_next( Pcvoid_t array, char* index ) {
   return (PWord_t) ::JudySLNext(array, (uint8_t *) index, NULL);
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

typedef uint32_t int_type;

// All words in big1.txt, big2.txt, big3.txt are <= 6 chars in length.
// big.txt  max word length is 6
// long.txt max word length is 208
//
// Based on rough benchmarking, the short fixed string hack below is only
// worth trying for MAX_STR_LEN_L up to about 30.
// See also https://backlinko.com/google-keyword-study
//
// To use (limited length) fixed length strings uncomment the next line.
#define MAX_STR_LEN_L (size_t) 12

#ifdef MAX_STR_LEN_L
#include 
using hash_type        = uint32_t;
using str_type         = boost::static_strings::basic_static_string;
#else
using hash_type        = uint64_t;
using str_type         = std::basic_string;
#endif

using str_int_type     = std::pair;
using vec_str_int_type = std::vector;

// Mimic the Perl get_properties subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

// convert positive number from string to uint32_t
inline uint32_t fast_atoll64(const char* str)
{
   uint32_t val = 0;
   uint8_t digit;
   while ((digit = uint8_t(*str++ - '0')) <= 9)
      val = val * 10 + digit;
   return val;
}

// Helper function to find a character.
inline char* find_char(char* first, char* last, char c)
{
   while (first != last) {
      if (*first == c) break;
      ++first;
   }
   return first;
}

inline constexpr size_t CHUNK_SIZE   = 32768;
inline constexpr size_t MAX_LINE_LEN = 255;
inline constexpr size_t NUM_MAPS     = 963;

static int64_t get_properties(
   const char*        fname,     // in    : the input file name
   const int          nthds,     // in    : the number of threads
   auto&              L,         // in    : the locks array
   auto&              M,         // inout : the maps array
   auto&              T)         // inout : the totals array
{
   int64_t num_lines = 0;
   std::ifstream fin(fname, std::ifstream::binary);
   if (!fin.is_open()) {
      std::cerr << "Error opening '" << fname << "' : " << strerror(errno) << '\n';
      return num_lines;
   }           
               
   #pragma omp parallel reduction(+:num_lines)
   {              
      std::string buf;
      buf.resize(CHUNK_SIZE + MAX_LINE_LEN + 1, '\0');

      while (fin.good()) {
         size_t len = 0;

         // Read the next chunk serially.
         #pragma omp critical
         {
            fin.read(&buf[0], CHUNK_SIZE);
            if ((len = fin.gcount()) > 0) {
               if (buf[len - 1] != '\n' && fin.getline(&buf[len], MAX_LINE_LEN)) {
                  // Getline discards the newline char and appends null char.
                  // Therefore, change '\0' to '\n'.
                  len += fin.gcount();
                  buf[len - 1] = '\n';
               }
            } 
         }

         if (!len)
            break;

         buf[len]    = '\0';
         char *first = &buf[0];
         char *last  = &buf[len];

         // Process max Nthreads chunks concurrently.
         while (first < last) {
            char* beg_ptr{first};
            char* end_ptr{find_char(first, last, '\n')};
            char* found = find_char(beg_ptr, end_ptr, '\t');

            first = end_ptr + 1;
            if (found == end_ptr)
               continue;

            assert(*found == '\t');
            int_type count = fast_atoll64(found + 1);

#ifdef MAX_STR_LEN_L
            size_t klen = std::min(MAX_STR_LEN_L, (size_t)(found - beg_ptr));
            beg_ptr[klen] = '\0';
#else
            size_t klen = found - beg_ptr;
            beg_ptr[klen] = '\0';
#endif
            size_t hv = std::hash>{}(
               std::basic_string_view{ reinterpret_cast(beg_ptr), klen });

            size_t idx = ( ((hv & 0x000000000000ffffULL) << 16) |
                           ((hv & 0x00000000ffff0000ULL) >> 16) ) % NUM_MAPS;

            if (nthds == 1) {
               PWord_t PValue = judysl_get(M[idx], beg_ptr);
               if (! PValue) {
                  PValue = judysl_ins(M[idx], beg_ptr);
                  ++T[idx];
               }
               (*PValue) += count;
            }
            else {
               L[idx].lock();
               PWord_t PValue = judysl_get(M[idx], beg_ptr);
               if (! PValue) {
                  PValue = judysl_ins(M[idx], beg_ptr);
                  ++T[idx];
               }
               (*PValue) += count;
               L[idx].unlock();
            }

            ++num_lines;
         }
      }
   }

   fin.close();
   // std::cerr << "getprops done\n";
   return num_lines;
}

// Output subroutine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

size_t divide_up(size_t dividend, size_t divisor)
{
   if (dividend % divisor)
      return (size_t)(dividend / divisor) + 1;
   else
      return (size_t)(dividend / divisor);
}

static void out_properties(
   const int          nthds,     // in   : the number of threads
   vec_str_int_type&  vec)       // in   : the vector to output
{
   size_t num_chunks = divide_up(vec.size(), CHUNK_SIZE);
   int nthds_out = 1;
#ifdef _OPENMP
   nthds_out = std::min(nthds, 32);
#endif

   #pragma omp parallel for ordered schedule(static, 1) num_threads(nthds_out)
   for (size_t chunk_id = 1; chunk_id <= num_chunks; ++chunk_id) {
      std::string str(""); str.reserve(2048 * 1024);
      auto it  = vec.begin() + (chunk_id - 1) * CHUNK_SIZE;
      auto it2 = vec.begin() + std::min(vec.size(), chunk_id * CHUNK_SIZE);

      for (; it != it2; ++it) {
#ifdef MAX_STR_LEN_L
         str.append(it->first.data());
#else
         str.append(it->first.data(), it->first.size());
#endif
         str.append("\t", 1);
         str.append(std::to_string(it->second));
         str.append("\n", 1);
      }

      #pragma omp ordered
      std::cout << str << std::flush;
   }
}

typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;

double elaspe_time(time_point cend, time_point cstart) {
   return double (
      std::chrono::duration_cast(cend - cstart).count()
   ) * 1e-3;
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

int main(int argc, char* argv[])
{
   if (argc < 2) {
      if (argc > 0)
         std::cerr << "usage: llil4judy file1 file2 ... >out.txt\n";
      return 1;
   }

   std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fixed);
#ifdef MAX_STR_LEN_L
   std::cerr << "llil4judy (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
   std::cerr << "llil4judy start\n";
#endif
#ifdef _OPENMP
   std::cerr << "use OpenMP\n";
#else
   std::cerr << "don't use OpenMP\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
   std::cerr << "don't use boost sort\n";
#else
   std::cerr << "use boost sort\n";
#endif
   time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3;
   cstart1 = high_resolution_clock::now();

#ifdef _OPENMP
   // Determine the number of threads.
   const char* env_nthds = std::getenv("NUM_THREADS");
   int nthds = ( env_nthds && strlen(env_nthds) )
      ? ::atoi(env_nthds)
      : std::thread::hardware_concurrency();
   omp_set_dynamic(false);
   omp_set_num_threads(nthds);
   omp_set_max_active_levels(1);
   int nthds_move = std::min(nthds, 12);
#else
   int nthds = 1;
   int nthds_move = 1;
#endif

   // Get the list of input files from the command line
   int    nfiles = argc - 1;
   char** fname  = &argv[1];

   spinlock_mutex L[NUM_MAPS];
   Pvoid_t        M[NUM_MAPS];
   std::size_t    T[NUM_MAPS] {};

   for (size_t i = 0; i < NUM_MAPS; ++i)
      M[i] = (Pvoid_t) NULL;

   int64_t num_lines = 0;
   for (int i = 0; i < nfiles; ++i)
      num_lines += get_properties(fname[i], nthds, L, M, T);

   int64_t num_keys = 0;
   for (size_t i = 0; i < NUM_MAPS; ++i)
      num_keys += T[i];

   cend1 = high_resolution_clock::now();
   double ctaken1 = elaspe_time(cend1, cstart1);
   std::cerr << "get properties      " << std::setw(8) << ctaken1 << " secs\n";

   cstart2 = high_resolution_clock::now();

   // Store the properties into a vector
   vec_str_int_type propvec;

   propvec.resize(num_keys);
   std::array I;
   I[0] = propvec.begin();

   for (size_t i = 1; i < NUM_MAPS; ++i)
      I[i] = I[i-1] + T[i-1];

   #pragma omp parallel for schedule(static, 1) num_threads(nthds_move)
   for (size_t i = 0; i < NUM_MAPS; ++i) {
      char index[MAX_LINE_LEN+1]; index[0] = '\0';
      PWord_t PValue;
      auto it = I[i];

      PValue = judysl_first(M[i], index);
      while (PValue != NULL) {
         *it++ = std::make_pair(index, *PValue);
         PValue = judysl_next(M[i], index);
      }
   }

   cend2 = high_resolution_clock::now();
   double ctaken2 = elaspe_time(cend2, cstart2);
   std::cerr << "judy to vector      " << std::setw(8) << ctaken2 << " secs\n";

   if (!propvec.size()) {
      std::cerr << "No work, exiting...\n";
      return 1;
   }

   cstart3 = high_resolution_clock::now();

   // Sort the vector by (count) in reverse order, (name) in lexical order
   auto reverse_order = [](const str_int_type& left, const str_int_type& right) {
      return left.second != right.second
         ? left.second > right.second
         #ifdef MAX_STR_LEN_L
         : ::memcmp(left.first.data(), right.first.data(), MAX_STR_LEN_L) < 0;
         #else
         : left.first < right.first;
         #endif
   };
#if USE_BOOST_PARALLEL_SORT == 0
   // Standard sort
   std::sort(propvec.begin(), propvec.end(), reverse_order);
#else
   // Parallel sort
   boost::sort::block_indirect_sort(
      propvec.begin(), propvec.end(), reverse_order,
   #ifdef __NVCOMPILER_LLVM__
      std::min(nthds, 32)
   #else
      nthds
   #endif
   );
#endif

   cend3s = high_resolution_clock::now();

   // Output the sorted vector
   out_properties(nthds, propvec);
   cend3 = high_resolution_clock::now();

   double ctaken   = elaspe_time(cend3, cstart1);
   double ctaken3s = elaspe_time(cend3s, cstart3);
   double ctaken3o = elaspe_time(cend3, cend3s);

   std::cerr << "vector stable sort  " << std::setw(8) << ctaken3s << " secs\n";
   std::cerr << "write stdout        " << std::setw(8) << ctaken3o << " secs\n";
   std::cerr << "total time          " << std::setw(8) << ctaken   << " secs\n";
   std::cerr << "    count lines     " << num_lines << "\n";
   std::cerr << "    count unique    " << num_keys << "\n";

   // Hack to see Private Bytes in Windows Task Manager
   // (uncomment next line so process doesn't exit too quickly)
   // std::this_thread::sleep_for(milliseconds(9000));

   return 0;
}