> It now takes the entire CPU (64 logical threads) for Perl MCE 1.0 to run faster. :)
Now that's a challenge! Can I can push the dial further? ... or will the ingenious tybalt89's
unorthodox assistance from the side allow you to move the needle back towards 32? :)
Since I know how much you enjoyed my (anonymonk-provoked) MAX_STR_LEN_L hack in the long Long List is Long series,
I've tried a similar stunt here in a desperate attempt to improve data locality and cache performance.
I also added a (cheating) vector reserve and
the total time at the end (thanks for pointing out this oversight).
Anyways, here are the timings of my latest version, rtoa-pgatram-fixed.cpp, using the fast_io library:
$ ./rtoa-pgatram-fixed t1.txt >f.tmp
read_input_files : 3999000 items
read file time : 0.164 secs
roman_to_dec time : 0.088 secs
output time : 0.100 secs
total time : 0.353 secs
$ diff f.tmp roman.tmp
$ ./rtoa-pgatram-fixed t1.txt >f.tmp
read_input_files : 3999000 items
read file time : 0.167 secs
roman_to_dec time : 0.086 secs
output time : 0.098 secs
total time : 0.353 secs
$ diff f.tmp roman.tmp
$ ./rtoa-pgatram-fixed t1.txt >f.tmp
read_input_files : 3999000 items
read file time : 0.173 secs
roman_to_dec time : 0.086 secs
output time : 0.093 secs
total time : 0.353 secs
$ diff f.tmp roman.tmp
Update: with marioroy rtoa-pgatram-fixed2 below (without fast_io):
$ ./rtoa-pgatram-fixed2 t1.txt >f.tmp
read_input_files : 3999000 items
read file time : 0.189 secs
roman_to_dec time : 0.371 secs
total time : 0.560 secs
$ diff f.tmp roman.tmp
... with fast_io:
$ ./rtoa-pgatram-fixed2 t1.txt >f.tmp
read_input_files : 3999000 items
read file time : 0.178 secs
roman_to_dec time : 0.143 secs
total time : 0.322 secs
$ diff f.tmp roman.tmp
rtoa-pgatram-fixed.cpp
// rtoa-pgatram-fixed.cpp. Crude fixed length string version.
// Compile with:
// g++ -o rtoa-pgatram-fixed -std=c++20 -Wall -O3 rtoa-pgatram-fixed
+.cpp
// or:
// clang++ -o rtoa-pgatram-fixed -std=c++20 -Wall -O3 rtoa-pgatram-f
+ixed.cpp
// or:
// g++ -o rtoa-pgatram-fixed -std=c++20 -Wall -O3 -I "$HOME/local-fa
+st_io/fast_io/include" rtoa-pgatram-fixed.cpp
// to use the locally installed fast_io header-only library
#include <cctype>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
#include <numeric>
#include <chrono>
#include <iomanip>
// See [id://11149504] for more info on the fast_io C++ library
// #include <fast_io.h>
// -------------------------------------------------------------------
+---------
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::high_resolution_clock::time_point time_point;
typedef std::chrono::milliseconds milliseconds;
double elaspe_time(
time_point cend,
time_point cstart)
{
return double (
std::chrono::duration_cast<milliseconds>(cend - cstart).count()
) * 1e-3;
}
// -------------------------------------------------------------------
+---------
// Longest roman numeral is MMMDCCCLXXXVIII (3888) of length 15
// XXX: I'm off by one somewhere because 3888 fails with
// MAX_STR_LEN_L of 16 but works with 17
#define MAX_STR_LEN_L 17
// The basic idea is to keep this struct small and without pointers to
// improve data locality/cache performance when traversing the vector
struct str_type {
char slen;
char str[MAX_STR_LEN_L];
};
using vec_str_type = std::vector<str_type>;
using vec_int_type = std::vector<int>;
// Read an input file of Roman Numerals and append them to a list
// Return the number of Roman Numerals appended
static int read_input_file(
const char* fname, // in: file name containing a list of
+Roman Numerals
vec_str_type& vec_ret) // out: a vector of Roman Numeral strin
+gs
{
FILE* fh;
str_type line;
int cnt = 0;
fh = ::fopen(fname, "r");
if ( fh == NULL ) {
std::cerr << "Error opening '" << fname << "' : " << strerror(er
+rno) << "\n";
return 0;
}
while ( ::fgets( line.str, MAX_STR_LEN_L, fh ) != NULL ) {
line.slen = ::strlen(line.str) - 1; // -1 to strip trailing n
+ewline
vec_ret.emplace_back(line);
++cnt;
}
::fclose(fh);
return cnt;
}
// ---------------------------------------------------------------
// Though there are less than 256 initializers in this ascii table,
// the others are guaranteed by ANSI C to be initialized to zero.
static const int romtab[256] = {
0,0,0,0,0,0, 0, 0, 0, 0, // 00- 09
0,0,0,0,0,0, 0, 0, 0, 0, // 10- 19
0,0,0,0,0,0, 0, 0, 0, 0, // 20- 29
0,0,0,0,0,0, 0, 0, 0, 0, // 30- 39
0,0,0,0,0,0, 0, 0, 0, 0, // 40- 49
0,0,0,0,0,0, 0, 0, 0, 0, // 50- 59
0,0,0,0,0,0, 0, 100, 500, 0, // 60- 69
0,0,0,1,0,0, 50,1000, 0, 0, // 70- 79
0,0,0,0,0,0, 5, 0, 10, 0, // 80- 89
0,0,0,0,0,0, 0, 0, 0, 100, // 90- 99
500,0,0,0,0,1, 0, 0, 50,1000, // 100-109
0,0,0,0,0,0, 0, 0, 5, 0, // 110-119
10,0,0,0,0,0, 0, 0, 0, 0 // 120-129
};
// Return the arabic number for a roman letter c.
// Return zero if the roman letter c is invalid.
inline int urtoa(int c) { return romtab[c]; }
inline int accfn(int t, char c) {
return t + urtoa(c) - t % urtoa(c) * 2;
}
inline int roman_to_dec(const str_type& st) {
return std::accumulate( st.str, st.str + st.slen, 0, accfn );
}
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "usage: rtoa-pgatram-fixed file...\n";
return 1;
}
// Get the list of input files from the command line
int nfiles = argc - 1;
char** fname = &argv[1];
std::cerr << std::setprecision(3) << std::setiosflags(std::ios::fix
+ed);
time_point cstart1, cend1, cstart2, cend2, cstart3, cend3;
// Read the input files into roman_list
vec_str_type roman_list;
roman_list.reserve(3999000);
cstart1 = high_resolution_clock::now();
int cnt = 0;
for (int i = 0; i < nfiles; ++i) {
cnt += read_input_file( fname[i], roman_list );
}
cend1 = high_resolution_clock::now();
double ctaken1 = elaspe_time(cend1, cstart1);
std::cerr << "read_input_files : " << cnt << " items\n";
std::cerr << "read file time : " << std::setw(8) << ctaken1 << "
+ secs\n";
// Convert roman to decimal
vec_int_type arabic_list;
arabic_list.reserve(3999000);
cstart2 = high_resolution_clock::now();
for ( auto const& r : roman_list ) {
arabic_list.emplace_back( roman_to_dec(r) );
}
cend2 = high_resolution_clock::now();
double ctaken2 = elaspe_time(cend2, cstart2);
std::cerr << "roman_to_dec time : " << std::setw(8) << ctaken2 << "
+ secs\n";
// Output to stdout
cstart3 = high_resolution_clock::now();
for ( auto const& i : arabic_list ) {
std::cout << i << '\n';
// fast_io::io::println(i);
}
cend3 = high_resolution_clock::now();
double ctaken3 = elaspe_time(cend3, cstart3);
std::cerr << "output time : " << std::setw(8) << ctaken3 << "
+ secs\n";
double ctaken = elaspe_time(cend3, cstart1);
std::cerr << "total time : " << std::setw(8) << ctaken <<
+" secs\n";
return 0;
}
|