LC_ALL=C sort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C sort -k2nr >out.txt
real time: 12.671 seconds

LC_ALL=C parsort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C parsort -k2nr >out.txt
real time: 6.561 seconds

LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count | LC_ALL=C parsort -k2nr >out.txt
real time: 3.615 seconds

####

LC_ALL=C parsort -k1 big{1,2,3}.txt | awk -f tally-count.awk | LC_ALL=C parsort -k2nr >out.txt
real time: 6.520 seconds

LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count | LC_ALL=C parsort -k2nr >out.txt
real time: 2.398 seconds

####

cat big{1,2,3}.txt | ./linux-q -tT "select c1, sum(c2) from - group by c1 order by sum(c2) desc, c1" >out.txt
real time: 84.027 seconds

./linux-q -tT "select c1, sum(c2) from big[1-3].txt group by c1 order by sum(c2) desc, c1" >out.txt
real time: 89.539 seconds

####

# Tally adjacent count fields of duplicate key names.
#
# Reads tab-separated records (field 1 = key, field 2 = count) and prints one
# "key<TAB>total" record per run of identical keys. Only adjacent records are
# merged, so the input must already be grouped by key (e.g. sorted on field 1).

BEGIN {
    OFS = FS = "\t"
    key = ""
    sum = 0
    flg = 1        # stays 1 until the first record has been read
}

{
    if ( flg ) {
        # The first record opens the first group.
        flg = 0
        key = $1
    }
    if ( $1 == key ) {
        sum += $2
    } else {
        # Key changed: emit the finished group and start a new one.
        print key, sum
        key = $1
        sum = $2
    }
}

END {
    # Emit the last group. Print nothing when there was no input at all.
    if ( !flg ) print key, sum
}

####

// ----------------------------------------------------------------------------
// tally-count.cpp
// Tally adjacent count fields of duplicate key names.
// // Obtain the fast_io library (required dependency): // git clone --depth=1 https://github.com/cppfastio/fast_io // // clang++ compile on Linux: // clang++ -o tally-count -std=c++20 -Wall -O3 tally-count.cpp // // Example run: // LC_ALL=C parsort -k1 big{1,2,3}.txt | ./tally-count |\ // LC_ALL=C parsort -k2nr > out.txt // ---------------------------------------------------------------------------- #include // The fast_io header must come after chrono, else build error: // "no member named 'concatln' in namespace 'fast_io'" #include "fast_io/include/fast_io.h" #include #include #include #include #include #include #include // fast_atoll64 // https://stackoverflow.com/questions/16826422/ // c-most-efficient-way-to-convert-string-to-int-faster-than-atoi inline int64_t fast_atoll64( const char* str ) { int64_t val = 0; int sign = 0; if ( *str == '-' ) { sign = 1, ++str; } uint8_t digit; while ((digit = uint8_t(*str++ - '0')) <= 9) val = val * 10 + digit; return sign ? -val : val; } #define MAX_LINE_LEN_L 255 int main(int argc, char* argv[]) { std::array line, name; char* found; long long count, sum; int flag = 0; // obtain the first key-value pair delimited by tab while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), ::stdin ) != NULL ) { if ( ( found = std::find( line.begin(), line.end(), '\t' ) ) == line.end() ) continue; sum = fast_atoll64( found + 1 ); *found = '\0'; // key name ::memcpy( name.data(), line.data(), found - line.data() + 1 ); flag = 1; break; } // process the rest of standard input while ( ::fgets( line.data(), static_cast(MAX_LINE_LEN_L), ::stdin ) != NULL ) { if ( ( found = std::find( line.begin(), line.end(), '\t' ) ) == line.end() ) continue; count = fast_atoll64( found + 1 ); *found = '\0'; // key name if ( ! 
::strcmp( line.data(), name.data() ) ) { sum += count; } else { fast_io::io::println( fast_io::mnp::os_c_str(name.data()), "\t", sum ); ::memcpy( name.data(), line.data(), found - line.data() + 1 ); sum = count; } } if ( flag ) fast_io::io::println( fast_io::mnp::os_c_str(name.data()), "\t", sum ); return 0; }