cd $HOME/local-boost
wget https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0_rc1.tar.gz.json
wget https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0_rc1.tar.gz
tar -xzf boost_1_81_0_rc1.tar.gz
####
clang++ -o llil5vec-tbb -std=c++20 -Wall -O3 -I "$HOME/llil/cmdlinux/fast_io/include" -I "$HOME/local-boost/boost_1_81_0" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil5vec-tbb.cpp -l tbb
##
##
block_indirect_sort.hpp:274:23: warning: implicit capture of ‘this’ via ‘[=]’ is deprecated in C++20 [-Wdeprecated]
##
##
$ NUM_THREADS=6 ./llil4vec-tbb big1.txt big2.txt big3.txt >f.tmp
llil4vec-tbb (fixed string length=6) start
use TBB
get properties time : 0.399307 secs
sort properties time : 0.277303 secs
emplace set sort time : 0.764016 secs
write stdout time : 0.51597 secs
total time : 1.95691 secs
##
##
$ NUM_THREADS=6 ./llil5vec-tbb big1.txt big2.txt big3.txt >big-5vec.tmp
llil5vec-tbb (fixed string length=6) start
use TBB
use boost sort
get properties time : 0.367148 secs
sort properties time : 0.274697 secs
vector stable sort time : 0.385357 secs
write stdout time : 0.390569 secs
total time : 1.4179 secs
$ diff big-5vec.tmp big-3vec.tmp
##
##
$ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp
llil4vec2 (fixed string length=6) start
use OpenMP
use boost sort
get properties time : 0.353 secs
sort properties time : 0.271 secs
vector reduce time : 0.074 secs
vector stable sort time : 0.197 secs
write stdout time : 0.322 secs
total time : 1.219 secs
##
##
$ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp
llil4vec2 (fixed string length=6) start
use OpenMP
use boost sort
get properties time : 0.321 secs
sort properties time : 0.268 secs
vector reduce time : 0.078 secs
vector stable sort time : 0.194 secs
write stdout time : 0.329 secs
total time : 1.192 secs
##
##
$ NUM_THREADS=6 ./llil4vec2 big1.txt big2.txt big3.txt >f.tmp
llil4vec2 (fixed string length=6) use memcmp start
use OpenMP
use boost sort
get properties time : 0.311 secs
sort properties time : 0.261 secs
vector reduce time : 0.030 secs
vector stable sort time : 0.199 secs
write stdout time : 0.316 secs
total time : 1.120 secs
##
##
$ LD_PRELOAD=/usr/local/lib/libmimalloc.so NUM_THREADS=6 ./llil4vec-new big1.txt big2.txt big3.txt >f.tmp
llil4vec (fixed string length=6) start
use OpenMP
use boost sort
nthrs=6
get properties 0.192 secs
sort properties 0.261 secs
vector reduce 0.033 secs
vector stable sort 0.186 secs
write stdout 0.311 secs
total time 0.986 secs
$ LD_PRELOAD=/usr/local/lib/libmimalloc.so NUM_THREADS=6 ./llil4vec-new big1.txt big2.txt big3.txt >f.tmp
llil4vec (fixed string length=6) start
use OpenMP
use boost sort
nthrs=6
get properties 0.201 secs
sort properties 0.257 secs
vector reduce 0.032 secs
vector stable sort 0.194 secs
write stdout 0.300 secs
total time 0.986 secs
$ cmp f.tmp big-3vec.tmp
##
##
$ NUM_THREADS=6 ./llil4map big1.txt big2.txt big3.txt >f.tmp
llil4map start
use phmap::parallel_flat_hash_map
use OpenMP
use boost sort
get properties time : 1.6778 secs
finish merging time : 0.754921 secs
vector stable sort time : 0.76791 secs
write stdout time : 0.382316 secs
total time : 3.5832 secs
##
##
$ NUM_THREADS=6 LD_LIBRARY_PATH=/usr/local/lib ./llil4judy big1.txt big2.txt big3.txt >f.tmp
llil4judy (fixed string length=6) start
use OpenMP
use boost sort
get properties 0.682 secs
finish merging 2.338 secs
vector stable sort 0.262 secs
write stdout 0.526 secs
total time 3.810 secs
$ cmp f.tmp big-3vec.tmp
##
##
$ NUM_THREADS=6 ./llil4vec2 big?.txt big?.txt big?.txt >f4vec.tmp
llil4vec2 (fixed string length=6) start
use OpenMP
use boost sort
get properties time : 1.9719 secs
sort properties time : 2.27273 secs
vector reduce time : 0.201546 secs
vector stable sort time : 0.500317 secs
write stdout time : 0.721466 secs
total time : 5.66814 secs
$ NUM_THREADS=6 ./llil4map big?.txt big?.txt big?.txt >f4map.tmp
llil4map start
use phmap::parallel_flat_hash_map
use OpenMP
use boost sort
get properties time : 5.33992 secs
finish merging time : 2.20465 secs
vector stable sort time : 1.60465 secs
write stdout time : 1.21258 secs
total time : 10.3622 secs
$ diff f4map.tmp f4vec.tmp
$ ls -l f4map.tmp f4vec.tmp
-rw-r--r-- 183490874 Feb 1 09:38 f4map.tmp
-rw-r--r-- 183490874 Feb 1 09:37 f4vec.tmp
$ ls -l big*.txt
-rw-r--r-- 31636800 Jan 16 18:26 big1.txt
-rw-r--r-- 31636800 Jan 16 18:26 big2.txt
-rw-r--r-- 31636800 Jan 16 18:26 big3.txt
-rw-r--r-- 31636800 Feb 1 09:33 big4.txt
-rw-r--r-- 31636800 Feb 1 09:31 big5.txt
-rw-r--r-- 31636800 Feb 1 09:31 big6.txt
##
##
Update: a later version with improved get_properties():
$ NUM_THREADS=6 ./llil4vec2 big?.txt big?.txt big?.txt big?.txt big?.txt big?.txt >f4vec.tmp
llil4vec2 (fixed string length=6) start
use OpenMP
use boost sort
get properties time : 4.392 secs
sort properties time : 4.706 secs
vector reduce time : 0.225 secs
vector stable sort time : 0.491 secs
write stdout time : 0.766 secs
total time : 10.593 secs
$ NUM_THREADS=6 ./llil4map big?.txt big?.txt big?.txt big?.txt big?.txt big?.txt >f4map.tmp
llil4map start
use phmap::parallel_flat_hash_map
use OpenMP
use boost sort
get properties time : 10.6884 secs
finish merging time : 2.11571 secs
vector stable sort time : 1.83454 secs
write stdout time : 2.55906 secs
total time : 17.198 secs
$ diff f4map.tmp f4vec.tmp
##
##
// llil5vec-tbb.cpp
// Based on llil4vec-tbb.cpp in https://perlmonks.com/?node_id=11149687
//
// Vector version using the Intel TBB library
// Note: TBB concurrent vector is best avoided, too much locking overhead
// based on llil3vec-tbb-a.cpp https://perlmonks.com/?node_id=11149622
// 1. Capture time and diff via chrono.
// 2. Key words null-terminated for MAX_STR_LEN_L.
// 3. Concat strings using fast_io::concatln during output.
// 4. Support NUM_THREADS environment variable.
// 5. Merge local vector inside the parallel_for block.
// This allows threads, completing get_properties, to merge immediately.
// 6. Moved vec_loc instantiation inside for loop.
// 7. Support running one thread, writing to propvec (no merging).
// 8. Capture time for get and sort properties separately.
// 9. Added define for using boost's parallel sort; requires clang++.
//
// To obtain the fast_io library (required dependency):
// git clone --depth=1 https://github.com/cppfastio/fast_io
//
// Compiled on Linux with:
// clang++ -o llil5vec-tbb -std=c++20 -Wall -O3 -I "$HOME/local-fast_io/fast_io/include" -I "$HOME/local-boost/boost_1_81_0" -I "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/include" -L "$HOME/local-oneapi-tbb/oneapi-tbb-2021.7.0/lib" llil5vec-tbb.cpp -l tbb
// Also works with g++ but throws some compiler warnings about deprecated C++20 features.
// Seems to run slightly faster when compiled with clang++ rather than g++
// Uncomment to use Intel TBB library
#define USE_TBB_L 1
// Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/parallel.html
// This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux.
// Note: I was able to build this program just by downloading and unpacking Boost locally
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1
#include
#include
// The fast_io header must come after chrono, else build error:
// "no member named 'concatln' in namespace 'fast_io'"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if USE_BOOST_PARALLEL_SORT > 0
#include
#else
#endif
#include
#include
#ifdef USE_TBB_L
#include
#include
#include
#include
#endif
#include
#include
#include
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");
// ----------------------------------------------------------------------------
// Crude hack to see Windows Private Bytes in Task Manager by sleeping at
// program end (see also sleep hack at end of main)
// #include
// #include
// ----------------------------------------------------------------------------
typedef long long llil_int_type;
// Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length
// To use (limited length) fixed length strings uncomment the next line
// big.txt max word length is 6
// long.txt max word length is 208
// The standard big1.txt, big2.txt, big3.txt files all contain 3,515,200 lines
#define N_LINES_BIG1_L 3515200
// Based on rough benchmarking, the short fixed string hack below is only
// worth trying for MAX_STR_LEN_L up to about 22.
// See also https://backlinko.com/google-keyword-study
#define MAX_STR_LEN_L 6
#ifdef MAX_STR_LEN_L
using str_type = std::array;
#else
using str_type = std::string;
#endif
using str_int_type = std::pair;
using int_str_type = std::pair;
using vec_str_int_type = std::vector;
using vec_int_str_type = std::vector;
// fast_atoll64 --------------------------------------------------------
//
// https://stackoverflow.com/questions/16826422/
// c-most-efficient-way-to-convert-string-to-int-faster-than-atoi
inline int64_t fast_atoll64( const char* str )
{
int64_t val = 0;
int sign = 0;
if ( *str == '-' ) {
sign = 1, ++str;
}
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
return sign ? -val : val;
}
// Mimic the Perl get_properties subroutine ----------------------------
// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255
static void get_properties(
const char* fname, // in: the input file name
vec_int_str_type& vec_ret) // out: a vector of properties
{
FILE* fh;
char line[MAX_LINE_LEN_L+1];
char* found;
llil_int_type count;
fh = ::fopen(fname, "r");
if (fh == NULL) {
std::cerr << "Error opening '" << fname << "' : errno=" << errno << "\n";
return;
}
while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) {
found = ::strchr(line, '\t');
// fast_atoll64 is a touch faster than ::atoll
count = fast_atoll64( &line[found - line + 1] );
line[found - line] = '\0'; // word
#ifdef MAX_STR_LEN_L
str_type fixword { { '\0', '\0', '\0', '\0', '\0', '\0', '\0' } };
::memcpy( fixword.data(), line, found - line );
vec_ret.emplace_back( -count, fixword );
#else
vec_ret.emplace_back( -count, line );
#endif
}
::fclose(fh);
}
double elaspe_time(
std::chrono::high_resolution_clock::time_point cend,
std::chrono::high_resolution_clock::time_point cstart)
{
return double(
std::chrono::duration_cast(cend - cstart).count()
) * 1e-6;
}
// ---------------------------------------------------------------------
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: llil5vec-tbb file1 file2 ... >out.txt\n";
return 1;
}
#ifdef MAX_STR_LEN_L
std::cerr << "llil5vec-tbb (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#else
std::cerr << "llil5vec-tbb start\n";
#endif
#ifdef USE_TBB_L
std::cerr << "use TBB\n";
#else
std::cerr << "don't use TBB\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
std::cerr << "don't use boost sort\n";
#else
std::cerr << "use boost sort\n";
#endif
std::chrono::high_resolution_clock::time_point cstart1, cend1, cstart2, cend2, cstart3, cend3s, cend3;
cstart1 = std::chrono::high_resolution_clock::now();
#ifdef USE_TBB_L
// Determine the number of threads.
const char* env_nthrs = std::getenv("NUM_THREADS");
int nthrs = (env_nthrs && strlen(env_nthrs)) ? ::atoi(env_nthrs) : std::thread::hardware_concurrency();
tbb::global_control global_limit(tbb::global_control::max_allowed_parallelism, nthrs);
#else
int nthrs = 1;
#endif
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
// Create the vector of properties
vec_int_str_type propvec;
// propvec.reserve(N_LINES_BIG1_L * 3); // doesn't make much difference
// Run parallel, depending on the number of threads
if ( nthrs == 1 ) {
for (int i = 0; i < nfiles; ++i)
get_properties( fname[i], propvec );
}
#ifdef USE_TBB_L
else {
tbb::parallel_for( tbb::blocked_range(0, nfiles, 1),
[&](tbb::blocked_range r)
{
for (int i = r.begin(); i < r.end(); ++i) {
vec_int_str_type locvec;
get_properties( fname[i], locvec );
// A static mutex that is shared across all threads
static tbb::spin_mutex mtx;
// Acquire a scoped lock
tbb::spin_mutex::scoped_lock lock(mtx);
// Append local vector to propvec
propvec.insert( propvec.end(), locvec.begin(), locvec.end() );
}
});
}
#endif
cend1 = std::chrono::high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "get properties time : " << ctaken1 << " secs\n";
cstart2 = std::chrono::high_resolution_clock::now();
// Needs to be sorted by word for later sum of adjacent count fields to work
#if USE_BOOST_PARALLEL_SORT == 0
#ifdef USE_TBB_L
tbb::parallel_sort(
#else
std::sort(
#endif
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) { return left.second < right.second; }
);
#else
// Try building with clang++ if g++ emits errors
boost::sort::block_indirect_sort(
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) { return left.second < right.second; },
nthrs
);
#endif
cend2 = std::chrono::high_resolution_clock::now();
double ctaken2 = elaspe_time(cend2, cstart2);
std::cerr << "sort properties time : " << ctaken2 << " secs\n";
cstart3 = std::chrono::high_resolution_clock::now();
// To sort, replace building a std::set with sorting a std::vector
// Note: negative count gives desired ordering
// Aside: consider how this loop might be parallelised
vec_int_str_type myvec;
auto it = propvec.cbegin();
str_type name_last = it->second;
llil_int_type count = it->first;
for (++it; it != propvec.cend(); ++it) {
if ( it->second == name_last ) {
count += it->first;
}
else {
myvec.emplace_back( count, name_last );
name_last = it->second;
count = it->first;
}
}
myvec.emplace_back( count, name_last );
// As usual, stable_sort with a simpler sort function tends to be slower
#if USE_BOOST_PARALLEL_SORT == 0
#ifdef USE_TBB_L
tbb::parallel_sort(
#else
std::sort(
#endif
myvec.begin(), myvec.end(),
[](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; }
// use this one for std::stable_sort
// [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; }
);
#else
// block_indirect_sort seems to be the fastest
// Note: spinsort takes only 3 parameters (no nthrs parameter, unlike the other two)
boost::sort::block_indirect_sort(
// boost::sort::parallel_stable_sort(
// boost::sort::spinsort(
myvec.begin(), myvec.end(),
[](const int_str_type& left, const int_str_type& right) { return left.first != right.first ? left.first < right.first : left.second < right.second; }
// [](const int_str_type& left, const int_str_type& right) { return left.first < right.first; }
, nthrs
);
#endif
cend3s = std::chrono::high_resolution_clock::now();
// Note: fix up negative count via -n.first
#ifdef MAX_STR_LEN_L
for ( auto const& n : myvec )
::print(fast_io::concatln(std::string(n.second.data()), "\t", -n.first));
#else
for ( auto const& n : myvec )
::print(fast_io::concatln(n.second, "\t", -n.first));
#endif
cend3 = std::chrono::high_resolution_clock::now();
double ctaken = elaspe_time(cend3, cstart1);
double ctaken3s = elaspe_time(cend3s, cstart3);
double ctaken3o = elaspe_time(cend3, cend3s);
std::cerr << "vector stable sort time : " << ctaken3s << " secs\n";
std::cerr << "write stdout time : " << ctaken3o << " secs\n";
std::cerr << "total time : " << ctaken << " secs\n";
// Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
// std::this_thread::sleep_for(std::chrono::milliseconds(90000000));
return 0;
}
##
##
// llil4vec2.cpp
// See also: perlmonks.com, node_id=11149754
// llil4vec.cpp with a reduce_vec function.
// Based on: https://perlmonks.com/?node_id=11149545
// OpenMP Little Book - https://nanxiao.gitbooks.io/openmp-little-book/content/
//
// Vector version of llil2grt.pl.
// based on llil3vec.cpp https://perlmonks.com/?node_id=11149482
// 1. Run get_properties in parallel.
// 2. Capture time and diff via chrono.
// 3. Threads: flush local vector periodically.
// 4. Key words null-terminated for MAX_STR_LEN_L.
// 5. Concat strings using fast_io::concatln during output.
// 6. Support NUM_THREADS environment variable.
// 7. Add FLUSH_VECTOR_PERIODICALLY define statement.
// 8. Removed periodically flush, not what I expected.
// 9. Simplified code, similar to the tbb implementation.
// A. Capture time for get and sort properties separately.
// B. Added define for using boost's parallel sort.
// C. Replaced atoll with fast_atoll64.
// D. Fast vector sorting - from llil5vec-tbb.
// E. Reduce in-place, duplicate key names - tally count.
// F. Exit early if no work; fast_io tweak writing to output;
// fixword: ensure not more than MAX_LINE_LEN_L characters;
// limit to 8 threads max for sorting.
// G. Capture time for vector reduce separately.
// I. Improved get_properties; set precision for timings.
//
// Obtain the fast_io library (required dependency):
// git clone --depth=1 https://github.com/cppfastio/fast_io
//
// g++ compile on Linux: (boost header may need the -Wno-stringop-overflow gcc option)
// g++ -o llil4vec2 -std=c++20 -Wall -O3 llil4vec2.cpp -I ./fast_io/include
// g++ -o llil4vec2-omp -std=c++20 -fopenmp -Wall -O3 llil4vec2.cpp -I ./fast_io/include
//
// This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64)
// that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe).
//
// clang++ compile: same args, without the -Wno-stringop-overflow option
// Seems to run slightly faster when compiled with clang++ instead of g++
//
// An example compile on Ubuntu Linux with some 3rd party (header) libraries unpacked to $HOME:
// clang++ -o llil4vec2 -std=c++20 -fopenmp -Wall -O3
// -I "$HOME/local-fast_io/fast_io/include"
// -I "$HOME/local-parallel-hashmap/parallel-hashmap"
// -I "$HOME/local-boost/boost_1_81_0"
// llil4vec2.cpp
//
// Obtain gen-llil.pl and gen-long-llil.pl from https://perlmonks.com/?node_id=11148681
// perl gen-llil.pl big1.txt 200 3 1
// perl gen-llil.pl big2.txt 200 3 1
// perl gen-llil.pl big3.txt 200 3 1
// perl gen-long-llil.pl long1.txt 600
// perl gen-long-llil.pl long2.txt 600
// perl gen-long-llil.pl long3.txt 600
//
// To make random input, obtain shuffle.pl from https://perlmonks.com/?node_id=11149800
// perl shuffle.pl big1.txt >tmp && mv tmp big1.txt
// perl shuffle.pl big2.txt >tmp && mv tmp big2.txt
// perl shuffle.pl big3.txt >tmp && mv tmp big3.txt
//
// Example run: llil4vec2 big1.txt big2.txt big3.txt >out.txt
// NUM_THREADS=3 llil4vec2-omp ...
// ----------------------------------------------------------------------------
// Specify 0/1 to use boost's parallel sorting algorithm; faster than __gnu_parallel::sort.
// https://www.boost.org/doc/libs/1_81_0/libs/sort/doc/html/sort/parallel.html
// This requires the boost header files: e.g. devpkg-boost bundle on Clear Linux.
// Note: Another option is downloading and unpacking Boost locally.
// (no need to build it because the bits we use are header file only)
#define USE_BOOST_PARALLEL_SORT 1
#include
#include
// The fast_io header must come after chrono, else build error:
// "no member named 'concatln' in namespace 'fast_io'"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if USE_BOOST_PARALLEL_SORT > 0
#include
#endif
#ifdef _OPENMP
#include
#endif
#include
#include
#include
#include
static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile");
// ----------------------------------------------------------------------------
typedef long long llil_int_type;
// Note: all words in big1.txt, big2.txt, big3.txt are <= 6 chars in length
// big.txt max word length is 6
// long.txt max word length is 208
// Based on rough benchmarking, the short fixed string hack below is only
// worth trying for MAX_STR_LEN_L up to about 22.
// See also https://backlinko.com/google-keyword-study
// Note: if input data contains words longer than MAX_STR_LEN_L
// the program may malfunction or even crash
// To use (limited length) fixed length strings uncomment the next line
#define MAX_STR_LEN_L 6
#ifdef MAX_STR_LEN_L
// Uncomment next line to use C memcmp function to compare fixed length strings
#define USE_MEMCMP_L 1
// using str_type = std::array;
using str_type = std::array;
#else
// using str_type = std::string;
using str_type = std::basic_string;
#endif
using int_str_type = std::pair;
using vec_int_str_type = std::vector;
// fast_atoll64
// https://stackoverflow.com/questions/16826422/
// c-most-efficient-way-to-convert-string-to-int-faster-than-atoi
// Commenting out done below because llil spec [id://11148465] states:
// each line must match : ^[a-z]+\t\d+$
// i.e. you may assume count >=0
inline int64_t fast_atoll64( const char* str )
{
int64_t val = 0;
// int sign = 0;
// if ( *str == '-' ) {
// sign = 1, ++str;
// }
uint8_t digit;
while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit;
// return sign ? -val : val;
return val;
}
// Mimic the Perl get_properties subroutine ----------------------------
// Limit line length and use ANSI C functions to try to boost performance
#define MAX_LINE_LEN_L 255
static void get_properties(
const char* fname, // in: the input file name
vec_int_str_type& vec_ret) // out: a vector of properties
{
FILE* fh;
std::array line;
char* found;
llil_int_type count;
fh = ::fopen(fname, "r");
if ( fh == NULL ) {
std::cerr << "Error opening '" << fname << "' : " << strerror(errno) << "\n";
return;
}
while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), fh ) != NULL ) {
found = std::find( line.begin(), line.end(), '\t' );
count = fast_atoll64(found+1);
#ifdef MAX_STR_LEN_L
str_type fixword {}; // Note: {} initializes all elements of fixword to '\0'
std::copy( line.begin(), found, fixword.begin() );
vec_ret.emplace_back( count, fixword );
#else
*found = '\0';
vec_ret.emplace_back( count, line.data() );
#endif
}
::fclose(fh);
}
// Reduce a vector range (tally adjacent count fields of duplicate key names)
// Return the reduced length
static vec_int_str_type::size_type reduce_vec(
vec_int_str_type::iterator it1, // range of vector elements to reduce
vec_int_str_type::iterator it2
)
{
auto itr = it1;
auto itw = it1;
llil_int_type count = itr->first;
str_type name_last = itr->second;
for ( ++itr; itr != it2; ++itr ) {
#ifdef USE_MEMCMP_L
if ( ::memcmp(itr->second.data(), name_last.data(), MAX_STR_LEN_L) == 0 ) {
#else
if ( itr->second == name_last ) {
#endif
count += itr->first;
}
else {
itw->first = count;
itw->second = name_last;
++itw;
count = itr->first;
name_last = itr->second;
}
}
itw->first = count;
itw->second = name_last;
return std::distance(it1, ++itw);
}
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;
double elaspe_time(
time_point cend,
time_point cstart)
{
return double (
std::chrono::duration_cast(cend - cstart).count()
) * 1e-3;
}
// ---------------------------------------------------------------------
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: llil4vec2 file1 file2 ... >out.txt\n";
return 1;
}
std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fixed);
#ifdef MAX_STR_LEN_L
#ifdef USE_MEMCMP_L
std::cerr << "llil4vec2 (fixed string length=" << MAX_STR_LEN_L << ") use memcmp start\n";
#else
std::cerr << "llil4vec2 (fixed string length=" << MAX_STR_LEN_L << ") start\n";
#endif
#else
std::cerr << "llil4vec2 start\n";
#endif
#ifdef _OPENMP
std::cerr << "use OpenMP\n";
#else
std::cerr << "don't use OpenMP\n";
#endif
#if USE_BOOST_PARALLEL_SORT == 0
std::cerr << "don't use boost sort\n";
#else
std::cerr << "use boost sort\n";
#endif
time_point cstart1, cend1, cstart2, cend2, cstart3, cend3r, cend3s, cend3;
cstart1 = std::chrono::high_resolution_clock::now();
#ifdef _OPENMP
// Determine the number of threads.
const char* env_nthrs = std::getenv("NUM_THREADS");
int nthrs = (env_nthrs && strlen(env_nthrs)) ? ::atoi(env_nthrs) : std::thread::hardware_concurrency();
omp_set_dynamic(false);
omp_set_num_threads(nthrs);
#else
int nthrs = 1;
#endif
int nthrs_sort = ( std::thread::hardware_concurrency() < 12 )
? std::thread::hardware_concurrency()
: 12;
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
// Create the vector of properties
vec_int_str_type propvec;
// Run parallel, depending on the number of threads
if ( nthrs == 1 || nfiles == 1 ) {
for (int i = 0; i < nfiles; ++i)
get_properties( fname[i], propvec );
}
#ifdef _OPENMP
else {
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < nfiles; ++i) {
vec_int_str_type locvec;
get_properties( fname[i], locvec );
#pragma omp critical
{
// Append local vector to propvec
propvec.insert( propvec.end(), locvec.begin(), locvec.end() );
}
}
}
#endif
if (!propvec.size()) {
std::cerr << "No work, exiting...\n";
return 1;
}
cend1 = std::chrono::high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "get properties time : " << std::setw(8) << ctaken1 << " secs\n";
cstart2 = std::chrono::high_resolution_clock::now();
// Needs to be sorted by word for later sum of adjacent count fields to work
#if USE_BOOST_PARALLEL_SORT == 0
std::sort(
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) {
return
#ifdef USE_MEMCMP_L
::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0;
#else
left.second < right.second;
#endif
}
);
#else
boost::sort::block_indirect_sort(
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) {
return
#ifdef USE_MEMCMP_L
::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0;
#else
left.second < right.second;
#endif
},
nthrs_sort
);
#endif
cend2 = std::chrono::high_resolution_clock::now();
double ctaken2 = elaspe_time(cend2, cstart2);
std::cerr << "sort properties time : " << std::setw(8) << ctaken2 << " secs\n";
cstart3 = std::chrono::high_resolution_clock::now();
// Reduce in-place (tally adjacent count fields of duplicate key names)
vec_int_str_type::size_type newsize = reduce_vec( propvec.begin(), propvec.end() );
propvec.resize(newsize);
cend3r = std::chrono::high_resolution_clock::now();
// Sort the vector by (count) in reverse order, (name) in lexical order
#if USE_BOOST_PARALLEL_SORT == 0
std::sort(
// Standard sort
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) {
#ifdef USE_MEMCMP_L
return left.first != right.first
? left.first > right.first
: ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0;
#else
return left.first != right.first
? left.first > right.first
: left.second < right.second;
#endif
}
);
#else
boost::sort::block_indirect_sort(
// Parallel sort
propvec.begin(), propvec.end(),
[](const int_str_type& left, const int_str_type& right) {
#ifdef USE_MEMCMP_L
return left.first != right.first
? left.first > right.first
: ::memcmp(left.second.data(), right.second.data(), MAX_STR_LEN_L) < 0;
#else
return left.first != right.first
? left.first > right.first
: left.second < right.second;
#endif
},
nthrs_sort
);
#endif
cend3s = std::chrono::high_resolution_clock::now();
// Output the sorted vector
for ( auto const& n : propvec ) {
#ifdef MAX_STR_LEN_L
// Note: finding doco on mnp::os_c_str() is a challenge, but testing shows this
// prints non null-terminated strings of length MAX_STR_LEN_L correctly
// ::print(fast_io::concatln(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first));
// ::println(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first);
fast_io::io::println(fast_io::mnp::os_c_str(n.second.data(), MAX_STR_LEN_L), "\t", n.first);
#else
// ::print(fast_io::concatln(n.second, "\t", n.first));
// ::println(n.second, "\t", n.first);
fast_io::io::println(n.second, "\t", n.first);
#endif
}
cend3 = std::chrono::high_resolution_clock::now();
double ctaken = elaspe_time(cend3, cstart1);
double ctaken3r = elaspe_time(cend3r, cstart3);
double ctaken3s = elaspe_time(cend3s, cend3r);
double ctaken3o = elaspe_time(cend3, cend3s);
std::cerr << "vector reduce time : " << std::setw(8) << ctaken3r << " secs\n";
std::cerr << "vector stable sort time : " << std::setw(8) << ctaken3s << " secs\n";
std::cerr << "write stdout time : " << std::setw(8) << ctaken3o << " secs\n";
std::cerr << "total time : " << std::setw(8) << ctaken << " secs\n";
// Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly)
// std::this_thread::sleep_for(std::chrono::milliseconds(90000000));
return 0;
}