# llil2.pl
# Example run:  perl llil2.pl tt1.txt tt2.txt tt3.txt >out.txt

use strict;
use warnings;

# ----------------------------------------------------------------------
# LLiL specification
# ------------------
# A LLiL-format file is a text file.
# Each line consists of a lowercase name a TAB character and a non-negative integer count.
# That is, each line must match : ^[a-z]+\t\d+$
# For example, reading the LLiL-format files, tt1.txt containing:
#   camel\t42
#   pearl\t94
#   dromedary\t69
# and tt2.txt containing:
#   camel\t8
#   hello\t12345
#   dromedary\t1
# returns this hashref:
#   $hash_ret{"camel"}     = 50
#   $hash_ret{"dromedary"} = 70
#   $hash_ret{"hello"}     = 12345
#   $hash_ret{"pearl"}     = 94
# That is, values are added for items with the same key.
#
# To get the required LLiL text, you must sort the returned hashref
# descending by value and insert a TAB separator:
#   hello\t12345
#   pearl\t94
#   dromedary\t70
#   camel\t50
# To make testing via diff easier, we further sort ascending by name
# for lines with the same value.
# ----------------------------------------------------------------------

# Function get_properties
# Read a list of LLiL-format files
# Return a reference to a hash of properties
#
#   in:  $files - reference to a list of LLiL-format file names
#   out: reference to a hash mapping each name to the sum of its counts
#        across all the files
#   Dies if a file cannot be opened or closed.
#   Lines are not validated against the LLiL pattern; each line is simply
#   split on TAB and the first two fields are used.
sub get_properties {
    my $files = shift;    # in: reference to a list of LLiL-format files
    my %hash_ret;         # out: reference to a hash of properties
    for my $fname ( @{$files} ) {
        open( my $fh, '<', $fname ) or die "error: open '$fname': $!";

        # Read into a lexical rather than the global $_, so that the
        # caller's $_ is not clobbered by this loop.
        while ( my $line = <$fh> ) {
            chomp $line;
            my ( $word, $count ) = split /\t/, $line;
            $hash_ret{$word} += $count;
        }
        close($fh) or die "error: close '$fname': $!";
    }
    return \%hash_ret;
}

# ----------------- mainline -------------------------------------------

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;

warn "llil2 start\n";

# Phase 1: read and merge all the input files (timed in whole seconds)
my $tstart1 = time;
my $href    = get_properties( \@llil_files );
my $tend1   = time;
my $taken1  = $tend1 - $tstart1;
warn "get_properties : $taken1 secs\n";

# Phase 2: sort and print (timed in whole seconds)
my $tstart2 = time;
# Using two sorts is waaay faster than one in Perl for some reason!
(see [id://11148545]) for my $key ( sort { $href->{$b} <=> $href->{$a} } sort keys %{$href} ) { print "$key\t$href->{$key}\n"; } my $tend2 = time; my $taken2 = $tend2 - $tstart2; my $taken = $tend2 - $tstart1; warn "sort + output : $taken2 secs\n"; warn "total : $taken secs\n"; #### // llil2.cpp. C++ 11 version of Perl llil.pl. // llil2.cpp is faster than llil.cpp while also clarifying limits: // - all keys should be less than 200 or so characters in length // - numbers are 64 bit integers (max: 9,223,372,036,854,775,807) // g++ compile on Linux: // g++ -o llil2 -std=c++11 -Wall -O3 llil2.cpp // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil2 tt1.txt tt2.txt tt3.txt >out.txt #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static_assert(sizeof(size_t) == sizeof(int64_t), "size_t too small, need a 64-bit compile"); // ---------------------------------------------------------------------------- // Crude hack to see Windows Private Bytes in Task Manager by sleeping at // program end (see also sleep hack at end of main) // #include // #include // For some performance hacks to speed up C++ I/O see: // https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/ // The only one we use here is to prefer "\n" to std::endl to reduce stdout flushing // ---------------------------------------------------------------------------- typedef long long llil_int_type; using str_int_type = std::pair; using map_str_int_type = std::map; using vec_str_int_type = std::vector; // Mimic the Perl get_properties subroutine ---------------------------- // Limit line length and use lower level ANSI C functions to try to boost I/O performance // TODO (maybe): // - reading: Try ::setvbuf(fh, NULL, _IOFBF, 65536) or some such on input 
files // - writing: Try ::setvbuf(stdout, stdout_buf, _IOFBF, sizeof(stdout_buf)) on stdout // ... or instead of writing to stdout, take an output file as a program argument #define MAX_LINE_LEN_L 255 static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names map_str_int_type& hash_ret) // out: a hash of properties { FILE* fh; char line[MAX_LINE_LEN_L+1]; char* word; char* count; for (int i = 0; i < nfiles; ++i) { fh = ::fopen(fname[i], "r"); if (fh == NULL) { std::cerr << "Error opening '" << fname[i] << "'\n"; return; } while ( ::fgets(line, MAX_LINE_LEN_L, fh) != NULL ) { word = ::strtok(line, "\t"); count = ::strtok(NULL, "\n"); hash_ret[word] += ::atoll(count); } ::fclose(fh); } } // --------------------------------------------------------------------- int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil2 file1 file2 ... >out.txt\n"; return 1; } std::cerr << "llil2 start\n"; time_t tstart1 = ::time(NULL); // Create the hash of properties map_str_int_type hash_ret; get_properties(argc - 1, &argv[1], hash_ret); time_t tend1 = ::time(NULL); long taken1 = static_cast(::difftime(tend1, tstart1) + 0.5); std::cerr << "get_properties : " << taken1 << " secs\n"; // Sort descending by value, i.e. mimic this Perl code in C++: // sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} time_t tstart2 = ::time(NULL); vec_str_int_type v( hash_ret.begin(), hash_ret.end() ); std::sort( v.begin(), v.end(), [](const str_int_type& left, const str_int_type& right) { return right.second != left.second ? 
right.second < left.second : left.first < right.first; } ); // Output the merged properties for ( auto const& n : v ) { std::cout << n.first << '\t' << n.second << '\n'; } time_t tend2 = ::time(NULL); long taken2 = static_cast(::difftime(tend2, tstart2) + 0.5); long taken = static_cast(::difftime(tend2, tstart1) + 0.5); std::cerr << "sort + output : " << taken2 << " secs\n"; std::cerr << "total : " << taken << " secs\n"; // Hack to see Private Bytes in Windows Task Manager (uncomment next line so process doesn't exit too quickly) // std::this_thread::sleep_for(std::chrono::milliseconds(90000000)); return 0; } #### get_properties : 11 secs sort + output : 74 secs total : 85 secs #### get_properties : 11 secs sort + output : 25 secs total : 36 secs #### get_properties : 10 secs sort + output : 20 secs total : 30 secs #### get_properties : 9 secs sort + output : 7 secs total : 16 secs #### get_properties : 6 secs sort + output : 6 secs total : 12 secs #### using map_str_int_type = std::unordered_map; // to (llil2a.cpp): using map_str_int_type = std::map; #### get_properties : 4 secs sort + output : 5 secs total : 9 secs #### sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} #### sort { $href->{$b} <=> $href->{$a} } sort keys %{$href} )