in reply to Re^11: Risque Romantic Rosetta Roman Race - All in One - OpenMP
in thread Risque Romantic Rosetta Roman Race

I wanted to come back and provide an MCE-like chunking variant for converting Roman numerals to decimal. It runs faster than the memory-mapping solutions when using 8 or more threads.

fast_io memory mapping:

$ NUM_THREADS=1 ./rtoa-pgatram-allinone2b t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.498 secs
737201628 75552000
$ NUM_THREADS=4 ./rtoa-pgatram-allinone2b t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.176 secs
737201628 75552000
$ NUM_THREADS=8 ./rtoa-pgatram-allinone2b t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.124 secs
737201628 75552000
$ NUM_THREADS=16 ./rtoa-pgatram-allinone2b t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.096 secs
737201628 75552000

portable memory mapping:

$ NUM_THREADS=1 ./rtoa-pgatram-allinone2c t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.488 secs
737201628 75552000
$ NUM_THREADS=4 ./rtoa-pgatram-allinone2c t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.143 secs
737201628 75552000
$ NUM_THREADS=8 ./rtoa-pgatram-allinone2c t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.091 secs
737201628 75552000
$ NUM_THREADS=16 ./rtoa-pgatram-allinone2c t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.065 secs
737201628 75552000

MCE-like chunking:

$ NUM_THREADS=1 ./rtoa-pgatram-allinone2d t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.489 secs
737201628 75552000
$ NUM_THREADS=4 ./rtoa-pgatram-allinone2d t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.144 secs
737201628 75552000
$ NUM_THREADS=8 ./rtoa-pgatram-allinone2d t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.075 secs
737201628 75552000
$ NUM_THREADS=16 ./rtoa-pgatram-allinone2d t1.txt t1.txt t1.txt t1.txt | cksum
do_it_all time : 0.048 secs
737201628 75552000

rtoa-pgatram-allinone2d.cpp

// rtoa-pgatram-allinone2d.cpp. Crude allinone version. // based on rtoa-pgatram-allinone2.cpp https://perlmonks.org/?node_i +d=11152186 // // Obtain the fast_io library (required dependency): // git clone --depth=1 https://github.com/cppfastio/fast_io // // Compile with g++ or clang++: // clang++ -o rtoa-pgatram-allinone2d -std=c++20 -fopenmp -Wall -O3 +-I fast_io/include rtoa-pgatram-allinone2d.cpp // // OpenMP Little Book: // https://nanxiao.gitbooks.io/openmp-little-book/content/ #include <chrono> #include <cstring> #include <string> #include <string_view> #include <numeric> #include <thread> #ifdef _OPENMP #include <omp.h> #endif #include <fstream> #include <iostream> #include <iomanip> #include <semaphore> // See [id://11149504] for more info on the fast_io C++ library #include <fast_io.h> #include <fast_io_legacy.h> // --------------------------------------------------------------- typedef std::chrono::high_resolution_clock high_resolution_clock; typedef std::chrono::high_resolution_clock::time_point time_point; typedef std::chrono::milliseconds milliseconds; double elaspe_time( time_point cend, time_point cstart) { return double ( std::chrono::duration_cast<milliseconds>(cend - cstart).count() ) * 1e-3; } // --------------------------------------------------------------- // Though there are less than 256 initializers in this ascii table, // the others are guaranteed by ANSI C to be initialized to zero. static const int romtab[256] = { 0,0,0,0,0,0, 0, 0, 0, 0, // 00- 09 0,0,0,0,0,0, 0, 0, 0, 0, // 10- 19 0,0,0,0,0,0, 0, 0, 0, 0, // 20- 29 0,0,0,0,0,0, 0, 0, 0, 0, // 30- 39 0,0,0,0,0,0, 0, 0, 0, 0, // 40- 49 0,0,0,0,0,0, 0, 0, 0, 0, // 50- 59 0,0,0,0,0,0, 0, 100, 500, 0, // 60- 69 0,0,0,1,0,0, 50,1000, 0, 0, // 70- 79 0,0,0,0,0,0, 5, 0, 10, 0, // 80- 89 0,0,0,0,0,0, 0, 0, 0, 100, // 90- 99 500,0,0,0,0,1, 0, 0, 50,1000, // 100-109 0,0,0,0,0,0, 0, 0, 5, 0, // 110-119 10,0,0,0,0,0, 0, 0, 0, 0 // 120-129 }; // Return the arabic number for a roman letter c. 
// Return zero if the roman letter c is invalid. inline int urtoa(int c) { return romtab[c]; } inline int accfn(int t, char c) { return t + urtoa(c) - t % urtoa(c) * 2; } inline int roman_to_dec(std::string_view st) { return std::accumulate(st.begin(), st.end(), 0, accfn); } // --------------------------------------------------------------- inline constexpr auto CHUNK_SIZE = 1048576; inline constexpr auto LINE_LENGTH = 255; // Helper function to find '\n'. inline constexpr char const* find_lf(char const* first, char const* la +st) { while (first != last) { if (*first == '\n') break; ++first; } return first; } // Read an input file of Roman Numerals and do it all. static void do_it_all( std::ifstream& fin, // in: file input stream containing a list + of Roman Numerals int nthds // in: number of threads ) { fast_io::out_buf_type obf{fast_io::out()}; std::binary_semaphore *sem[nthds]; size_t next_chunk_id = 0; // Create semaphores for orderly output. for (int i = 0; i < nthds; ++i) sem[i] = new std::binary_semaphore{0}; #pragma omp parallel { std::string buf; buf.resize(CHUNK_SIZE + LINE_LENGTH + 1, '\0'); const char *first, *last; size_t chunk_id, len; while (fin.good()) { std::string output; len = 0; // Read the next chunk serially. // #pragma omp critical { fin.read(&buf[0], CHUNK_SIZE); if ((len = fin.gcount()) > 0) { chunk_id = ++next_chunk_id; if (buf[len - 1] != '\n' && fin.getline(&buf[len], LINE +_LENGTH)) { // Getline discards the newline char and appends nul +l char. // Therefore, change '\0' to '\n'. len += fin.gcount(); buf[len - 1] = '\n'; } } } if (!len) break; buf[len] = '\0'; first = &buf[0]; last = &buf[len]; // Process max Nthreads chunks concurrently. // while (first < last) { auto beg_ptr{first}; first = find_lf(first, last); auto end_ptr{first}; int dec = roman_to_dec(std::string_view(beg_ptr, end_ptr - + beg_ptr)); output.append(fast_io::concatln(dec)); ++first; } // Output completed chunk, orderly by chunk_id. 
// if (nthds > 1 && chunk_id > 1) sem[chunk_id % nthds]->acquire(); fast_io::io::print(obf, output); if (nthds > 1) sem[(chunk_id + 1) % nthds]->release(); } } // Destroy the dynamically allocated semaphores. for (int i = 0; i < nthds; ++i) delete sem[i]; } int main(int argc, char* argv[]) { if (argc < 2) { if (argc > 0) std::cerr << "Usage: rtoa-pgatram-allinone2d file... >out.txt +\n"; return 1; } std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fix +ed); time_point cstartall, cendall; cstartall = high_resolution_clock::now(); #ifdef _OPENMP // Determine the number of threads. int nthds = std::thread::hardware_concurrency(); const char* env_nthds1 = std::getenv("OMP_NUM_THREADS"); const char* env_nthds2 = std::getenv("NUM_THREADS"); if (env_nthds1 && strlen(env_nthds1)) nthds = ::atoi(env_nthds1); else if (env_nthds2 && strlen(env_nthds2)) nthds = ::atoi(env_nthds2); omp_set_dynamic(false); omp_set_num_threads(nthds); #else int nthds = 1; #endif int nfiles = argc - 1; char** fname = &argv[1]; for (int i = 0; i < nfiles; ++i) { std::ifstream fin(fname[i], std::ifstream::binary); if (!fin.is_open()) { std::cerr << "Error opening '" << fname[i] << "' : " << strer +ror(errno) << '\n'; continue; } do_it_all(fin, nthds); fin.close(); } cendall = high_resolution_clock::now(); double ctakenall = elaspe_time(cendall, cstartall); std::cerr << "do_it_all time : " << std::setw(8) << ctakenall << + " secs\n"; return 0; }