. $HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/env/vars.sh ##

##

g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" 
llil3vec-tbb.cpp -l tbb

##

##

$ locale -a
C
C.utf8
POSIX

##

##

    // std::cout.imbue(std::locale{"en_US.UTF8"});
    std::cout.imbue(std::locale{"C.utf8"});

##

##

using map_str_int_type = std::map;

##

##

using map_str_int_type = tbb::concurrent_map;

##

##

// llil3vec-tbb.cpp.
// Vector version of llil2grt.pl.
// g++ compile on Linux:
//    g++ -o llil3vec-tbb -std=c++20 -Wall -O3 llil3vec-tbb.cpp
//    g++ -o llil3vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb.cpp -l tbb
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run: llil3vec-tbb big1.txt big2.txt big3.txt >vec.tmp

// Uncomment to use Intel TBB library
#define USE_TBB_L 1
// Uncomment to use TBB concurrent vector (best to avoid this -- too much locking overhead)
// #define USE_CONCURRENT_VECTOR_L 1

// Maximum allowed number of input files
#define MAX_INPUT_FILES_L  64

#include 

#include 
#include 
#include 
#include 
#include 

#include 
#include 
#include 
#include 

#include 
#include 

#include 
#include 

#ifdef USE_TBB_L
#include 
#include 
#ifdef USE_CONCURRENT_VECTOR_L
#include 
#endif
#endif

#include 
#include 
#include 

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// ----------------------------------------------------------------------------

// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
//    #include 
//    #include 

// ----------------------------------------------------------------------------

typedef long long llil_int_type;

// Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length
// To use (limited length) fixed length strings uncomment the next line
// big.txt  max word length is 6
// long.txt max word length is 208
// Based on rough benchmarking, this short fixed string hack
// is only worth trying for MAX_STR_LEN_L up to about 64
#define MAX_STR_LEN_L 6

#ifdef MAX_STR_LEN_L
using str_type         = std::array;
#else
using str_type         = std::string;
#endif

using str_int_type     = std::pair;
using int_str_type     = std::pair;

using vec_str_int_type = std::vector;
using set_int_str_type = std::set;

#ifdef USE_CONCURRENT_VECTOR_L
using vec_int_str_type = tbb::concurrent_vector;
#else
using vec_int_str_type = std::vector;
#endif

// Mimic the Perl get_properties subroutine ----------------------------

// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255

static void get_properties_one_file(
   const char*        fname,       //  in: the input file name
   vec_int_str_type&  vec_ret)     // out: a vector of properties
{
   FILE* fh;
   char line[MAX_LINE_LEN_L+1];
   char* found;
   llil_int_type count;

   fh = ::fopen(fname, "r");
   if (fh == NULL) {
      std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n";
      return;
   }
   while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
      found = ::strchr(line, '\t');
      count = ::atoll( &line[found - line + 1] );
      line[found - line] = '\0';  // word
#ifdef MAX_STR_LEN_L
      str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } };
      ::memcpy( fixword.data(), line, found - line );
      vec_ret.emplace_back( -count, fixword );
#else
      vec_ret.emplace_back( -count, line );
#endif
   }
   ::fclose(fh);
}

static void get_properties(
   int                nfiles,      //  in: the number of input files
   char*              fname[],     //  in: the input file names
   vec_int_str_type&  vec_ret)     // out: a vector of properties
{
#ifdef USE_TBB_L
#ifdef USE_CONCURRENT_VECTOR_L
   tbb::parallel_for( tbb::blocked_range(0, nfiles),
                       [&](tbb::blocked_range r)
   {
      for (int i = r.begin(); i < r.end(); ++i)
         get_properties_one_file( fname[i], vec_ret );
   });
#else
   // Use array of local vectors to parallelise without locking overhead
   // of using tbb::concurrent_vector
   vec_int_str_type locvec[MAX_INPUT_FILES_L];
   tbb::parallel_for( tbb::blocked_range(0, nfiles),
                       [&](tbb::blocked_range r)
   {
      for (int i = r.begin(); i < r.end(); ++i)
         get_properties_one_file( fname[i], locvec[i] );
   });
#endif
#else
  // Not using TBB, just keep on appending to vec_ret
  for (int i = 0; i < nfiles; ++i)
     get_properties_one_file( fname[i], vec_ret );
#endif

#ifdef USE_TBB_L
#ifdef USE_CONCURRENT_VECTOR_L
  // nothing to do
#else
   for (int i = 0; i < nfiles; ++i)
      vec_ret.insert( vec_ret.end(), locvec[i].begin(), locvec[i].end() );
#endif
#endif

   // Needs to be sorted by word for later sum of adjacent count fields to work
#ifdef USE_TBB_L
   tbb::parallel_sort(
#else
   std::sort(
#endif
      vec_ret.begin(), vec_ret.end(),
      [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }
   );
}

// ---------------------------------------------------------------------

int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil3vec-tbb file1 file2 ... >out.txt\n";
      return 1;
   }
   if (argc > MAX_INPUT_FILES_L) {
      std::cerr << "sorry too many input files, must be less than " << MAX_INPUT_FILES_L << "\n";
      return 1;
   }

#ifdef MAX_STR_LEN_L
   std::cerr << "llil3vec-tbb (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
   std::cerr << "llil3vec-tbb start\n";
#endif
#ifdef USE_TBB_L
   std::cerr << "use TBB\n";
#ifdef USE_CONCURRENT_VECTOR_L
   std::cerr << "use concurrent vector\n";
#endif
#else
   std::cerr << "don't use TBB\n";
#endif
   time_t  tstart1 = ::time(NULL);
   clock_t cstart1 = ::clock();

   // Create the vector of properties
   vec_int_str_type vec_ret;
   get_properties(argc - 1, &argv[1], vec_ret);

   clock_t cend1  = ::clock();
   double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC;
   std::cerr << "get_properties      CPU time : " << ctaken1 << " secs\n";

   clock_t cstart2 = ::clock();

   // To avoid calling sort(), create an inverted std::set container
   // Note: negative count gives desired ordering
   set_int_str_type myset;
   auto it = vec_ret.cbegin();
   str_type name_last  = it->second;
   llil_int_type count = it->first;
   for (++it; it != vec_ret.cend(); ++it) {
      if ( it->second == name_last ) {
         count += it->first;
      }
      else {
         myset.emplace_hint( myset.end(), count, name_last );
         name_last = it->second;
         count     = it->first;
      }
   }
   myset.emplace_hint( myset.end(), count, name_last );

   clock_t cend2s = ::clock();

   // Output the (already sorted) std::set - no sort() function required
   // Note: fix up negative count via -n.first
#ifdef MAX_STR_LEN_L
   // If name is not NULL-terminated:
   for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN_L, n.second.data(), -n.first );
   // If name is NULL-terminated:
   // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.data(), -n.first );
   // for ( auto const& n : myset ) std::cout << n.second.data() << '\t' << -n.first << '\n';
#else
   // Can try printf vs std::cout to see which is faster
   for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_str(), -n.first );
   // for ( auto const& n : myset ) std::cout << n.second << '\t' << -n.first << '\n';
#endif

   clock_t cend2 = ::clock();
   time_t  tend2 = ::time(NULL);

   long    ttaken   = static_cast(::difftime(tend2, tstart1) + 0.5);
   double  ctaken   = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC;

   double  ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER_SEC;
   double  ctaken2o = (double) (cend2  - cend2s)  / (double)CLOCKS_PER_SEC;

   std::cerr << "emplace set sort    CPU time : " << ctaken2s << " secs\n";
   std::cerr << "write stdout        CPU time : " << ctaken2o << " secs\n";

   std::cerr << "total               CPU time : " << ctaken   << " secs\n";
   std::cerr << "total        wall clock time : " << ttaken   << " secs\n";

   // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
   //   std::this_thread::sleep_for(std::chrono::milliseconds(90000000));

   return 0;
}

##

##

OpenMp version:
$ time ./llil4vec_p big1.txt big2.txt big3.txt >vec.tmp
llil2vec (fixed string length=6) start
get_properties      time : 0.853868 secs
emplace set sort    time : 0.972116 secs
write stdout        time : 0.875946 secs
total               time : 2.70229 secs

real    0m3.041s
user    0m5.553s
sys     0m0.745s

##

##

OneTbb version:
$ time ./llil3vec-tbb big1.txt big2.txt big3.txt >f.tmp
llil3vec-tbb (fixed string length=6) start
use TBB
get_properties      CPU time : 3.30477 secs
emplace set sort    CPU time : 0.825981 secs
write stdout        CPU time : 0.866964 secs
total               CPU time : 4.99808 secs
total        wall clock time : 3 secs

real    0m2.890s
user    0m4.687s
sys     0m0.646s

##

##

$ time ./llil3vec-tbb-a big1.txt big2.txt big3.txt >f.tmp
llil3vec-tbb-a (fixed string length=6) start
use TBB
get_properties      CPU time : 2.9722 secs
emplace set sort    CPU time : 0.61156 secs
write stdout        CPU time : 0.828023 secs
total               CPU time : 4.41257 secs
total        wall clock time : 3 secs

real    0m2.552s
user    0m4.304s
sys     0m0.438s

##

##

clang++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/llil/cmdlinux/fast_io/include" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb-a.cpp -l tbb

##

##

// llil3vec-tbb-a.cpp.
// Vector version using the Intel TBB library
// Note: TBB concurrent vector is best avoided, too much locking overhead
// g++ compile on Linux:
//    g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 llil3vec-tbb-a.cpp
//    g++ -o llil3vec-tbb-a -std=c++20 -Wall -O3 -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil3vec-tbb-a.cpp -l tbb
// Seems to run slightly faster when compiled with clang++ instead of g++
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
// Example run: llil3vec-tbb-a big1.txt big2.txt big3.txt >vec.tmp

// Uncomment to use Intel TBB library
#define USE_TBB_L 1

// Maximum allowed number of input files
#define MAX_INPUT_FILES_L  64

#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 
#include 
#include 

#include 
#include 

#include 
#include 

#ifdef USE_TBB_L
#include 
#include 
#endif

#include 
#include 
#include 

static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");

// ----------------------------------------------------------------------------

// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
//    #include 
//    #include 

// ----------------------------------------------------------------------------

typedef long long llil_int_type;

// Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length
// To use (limited length) fixed length strings uncomment the next line
// big.txt  max word length is 6
// long.txt max word length is 208
// The standard big1.txt, big2.txt, big3.txt files all contain 3,515,200 lines
#define N_LINES_BIG1_L 3515200

// Based on rough benchmarking, this short fixed string hack
// is only worth trying for MAX_STR_LEN_L up to about 64
#define MAX_STR_LEN_L 6

#ifdef MAX_STR_LEN_L
using str_type         = std::array;
#else
using str_type         = std::string;
#endif

using str_int_type     = std::pair;
using int_str_type     = std::pair;

using vec_str_int_type = std::vector;
using vec_int_str_type = std::vector;
using set_int_str_type = std::set;

// Mimic the Perl get_properties subroutine ----------------------------

// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255

static void get_properties(
   const char*        fname,       //  in: the input file name
   vec_int_str_type&  vec_ret)     // out: a vector of properties
{
   FILE* fh;
   char line[MAX_LINE_LEN_L+1];
   char* found;
   llil_int_type count;

   fh = ::fopen(fname, "r");
   if (fh == NULL) {
      std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n";
      return;
   }
   while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
      found = ::strchr(line, '\t');
      count = ::atoll( &line[found - line + 1] );
      line[found - line] = '\0';  // word
#ifdef MAX_STR_LEN_L
      str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0' } };
      ::memcpy( fixword.data(), line, found - line );
      vec_ret.emplace_back( -count, fixword );
#else
      vec_ret.emplace_back( -count, line );
#endif
   }
   ::fclose(fh);
}

// ---------------------------------------------------------------------

int main(int argc, char* argv[])
{
   if (argc < 2) {
      std::cerr << "usage: llil3vec-tbb-a file1 file2 ... >out.txt\n";
      return 1;
   }
   if (argc > MAX_INPUT_FILES_L) {
      std::cerr << "sorry too many input files, must be less than " << MAX_INPUT_FILES_L << "\n";
      return 1;
   }

#ifdef MAX_STR_LEN_L
   std::cerr << "llil3vec-tbb-a (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
   std::cerr << "llil3vec-tbb-a start\n";
#endif
#ifdef USE_TBB_L
   std::cerr << "use TBB\n";
#else
   std::cerr << "don't use TBB\n";
#endif
   time_t  tstart1 = ::time(NULL);
   clock_t cstart1 = ::clock();

   // Get the list of input files from the command line
   int    nfiles = argc - 1;
   char** fname  = &argv[1];

   // Create the vector of properties
   vec_int_str_type propvec;
   // propvec.reserve(N_LINES_BIG1_L * 3);    // doesn't make much difference

#ifdef USE_TBB_L
   // Use an array of local vectors to parallelise
   vec_int_str_type locvec[MAX_INPUT_FILES_L];
   tbb::parallel_for( tbb::blocked_range(0, nfiles),
                       [&](tbb::blocked_range r)
   {
      for (int i = r.begin(); i < r.end(); ++i)
         get_properties( fname[i], locvec[i] );
   });

   // Append local vectors to propvec
   // Aside: Try setting locvec[0] to propvec and use/append one less local vector?
   for (int i = 0; i < nfiles; ++i)
      propvec.insert( propvec.end(), locvec[i].begin(), locvec[i].end() );
#else
  // Not using TBB: so just keep on appending to propvec
  for (int i = 0; i < nfiles; ++i)
     get_properties( fname[i], propvec );
#endif

   // Needs to be sorted by word for later sum of adjacent count fields to work
#ifdef USE_TBB_L
   tbb::parallel_sort(
#else
   std::sort(
#endif
      propvec.begin(), propvec.end(),
      [](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }
   );

   clock_t cend1  = ::clock();
   double ctaken1 = (double) (cend1 - cstart1) / (double)CLOCKS_PER_SEC;
   std::cerr << "get_properties      CPU time : " << ctaken1 << " secs\n";

   clock_t cstart2 = ::clock();

   // To avoid calling sort(), create an inverted std::set container
   // Note: negative count gives desired ordering
   // Aside: try parallel std::reduce/std::accumulate here? (see also [wp://MapReduce])
   set_int_str_type myset;
   auto it = propvec.cbegin();
   str_type name_last  = it->second;
   llil_int_type count = it->first;
   for (++it; it != propvec.cend(); ++it) {
      if ( it->second == name_last ) {
         count += it->first;
      }
      else {
         myset.emplace_hint( myset.end(), count, name_last );
         name_last = it->second;
         count     = it->first;
      }
   }
   myset.emplace_hint( myset.end(), count, name_last );

   clock_t cend2s = ::clock();

   // Output the (already sorted) std::set - no sort() function required
   // Note: fix up negative count via -n.first
#ifdef MAX_STR_LEN_L
   // If name is not NULL-terminated:
   for ( auto const& n : myset ) ::printf( "%.*s\t%lld\n", MAX_STR_LEN_L, n.second.data(), -n.first );
   // If name is NULL-terminated:
   // for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.data(), -n.first );
   // for ( auto const& n : myset ) std::cout << n.second.data() << '\t' << -n.first << '\n';
#else
   // Can try printf vs std::cout to see which is faster
   for ( auto const& n : myset ) ::printf( "%s\t%lld\n", n.second.c_str(), -n.first );
   // for ( auto const& n : myset ) std::cout << n.second << '\t' << -n.first << '\n';
#endif

   clock_t cend2 = ::clock();
   time_t  tend2 = ::time(NULL);

   long    ttaken   = static_cast(::difftime(tend2, tstart1) + 0.5);
   double  ctaken   = (double) (cend2 - cstart1) / (double)CLOCKS_PER_SEC;

   double  ctaken2s = (double) (cend2s - cstart2) / (double)CLOCKS_PER_SEC;
   double  ctaken2o = (double) (cend2  - cend2s)  / (double)CLOCKS_PER_SEC;

   std::cerr << "emplace set sort    CPU time : " << ctaken2s << " secs\n";
   std::cerr << "write stdout        CPU time : " << ctaken2o << " secs\n";

   std::cerr << "total               CPU time : " << ctaken   << " secs\n";
   std::cerr << "total        wall clock time : " << ttaken   << " secs\n";

   // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
   //   std::this_thread::sleep_for(std::chrono::milliseconds(90000000));

   return 0;
}