# llil.pl # Example run: perl llil.pl tt1.txt tt2.txt >oo1.tmp use strict; use warnings; # ---------------------------------------------------------------------- # LLiL specification # ------------------ # A LLiL-format file is a text file. # Each line consists of a lowercase name a TAB character and a non-negative integer count. # That is, each line must match : ^[a-z]+\t\d+$ # For example, reading the LLiL-format files, tt1.txt containing: # camel\t42 # pearl\t94 # dromedary\t69 # and tt2.txt containing: # camel\t8 # hello\t12345 # dromedary\t1 # returns this hashref: # $hash_ret{"camel"} = 50 # $hash_ret{"dromedary"} = 70 # $hash_ret{"hello"} = 12345 # $hash_ret{"pearl"} = 94 # That is, values are added for items with the same key. # # To get the required LLiL text, you must sort the returned hashref # descending by value and insert a TAB separator: # hello\t12345 # pearl\t94 # dromedary\t70 # camel\t50 # To make testing via diff easier, we further sort ascending by name # for lines with the same value. # ---------------------------------------------------------------------- # Function get_properties # Read a list of LLiL-format files # Return a reference to a hash of properties sub get_properties { my $files = shift; # in: reference to a list of LLiL-format files my %hash_ret; # out: reference to a hash of properties for my $fname ( @{$files} ) { open( my $fh, '<', $fname ) or die "error: open '$fname': $!"; while (<$fh>) { chomp; my ($word, $count) = split /\t/; $hash_ret{$word} += $count; } close($fh) or die "error: close '$fname': $!"; } return \%hash_ret; } # ----------------- mainline ------------------------------------------- @ARGV or die "usage: $0 file...\n"; my @llil_files = @ARGV; warn "llil start\n"; my $tstart1 = time; my $href = get_properties( \@llil_files ); my $tend1 = time; my $taken1 = $tend1 - $tstart1; warn "get_properties : $taken1 secs\n"; my $tstart2 = time; for my $key ( sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} ) { print "$key\t$href->{$key}\n"; } my $tend2 = time; my $taken2 = $tend2 - $tstart2; my $taken = $tend2 - $tstart1; warn "sort + output : $taken2 secs\n"; warn "total : $taken secs\n"; #### sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} #### // llil.cpp. C++ 11 version of Perl llil.pl. // g++ compile on Linux: // g++ -o llil -std=c++11 -Wall -O3 llil.cpp // This g++ command also works with mingw C++ compiler (https://sourceforge.net/projects/mingw-w64) // that comes bundled with Strawberry Perl (C:\Strawberry\c\bin\g++.exe). // Example run: llil tt1.txt tt2.txt >out.txt // Uncomment next line to sort by creating a multimap (instead of via the sort function) // #define LLIL_SORT_VIA_MULTIMAP_L 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // ------------------------------------------------------------ // Performance hacks to speed up IO. // See https://www.reddit.com/r/rust/comments/9xedap/how_to_achieve_fast_stdinstdout_io_suitable_for/ // Avoid flush by using "\n" instead of std::endl (this made a big difference in this program!) // This one made almost no difference in this program: // const auto io_speed_up =[](){ // std::ios::sync_with_stdio(false); // std::cin.tie(nullptr); // return nullptr; // }(); // ------------------------------------------------------------ using str_int_type = std::pair; using map_str_int_type = std::unordered_map; using vec_str_int_type = std::vector; // Mimic the Perl get_properties subroutine static void get_properties( int nfiles, // in: the number of input files char* fname[], // in: the input file names map_str_int_type& hash_ret) // out: a hash of properties { for (int i = 0; i < nfiles; ++i) { std::ifstream llfile(fname[i]); if (!llfile) { std::cerr << "Error opening '" << fname[i] << "'\n"; return; } for (std::string line; std::getline(llfile, line); ) { std::string word, count; std::stringstream ssline(line); std::getline(ssline, word, '\t'); std::getline(ssline, count); hash_ret[word] += std::stoi(count); } } } #ifdef LLIL_SORT_VIA_MULTIMAP_L // Convert a std::unordered_map to a std::multimap static std::multimap invert_map(const std::unordered_map& m) { std::multimap mm; for (std::unordered_map::const_iterator it = m.begin(); it != m.end(); ++it) { mm.insert( std::make_pair(it->second, it->first) ); } return mm; } #endif int main(int argc, char* argv[]) { if (argc < 2) { std::cerr << "usage: llil file1 file2 ... >out.txt\n"; return 1; } #ifdef LLIL_SORT_VIA_MULTIMAP_L std::cerr << "llil start (multimap version)\n"; #else std::cerr << "llil start (sort version)\n"; #endif time_t tstart1 = ::time(NULL); // Create the hash of properties map_str_int_type hash_ret; get_properties(argc - 1, &argv[1], hash_ret); time_t tend1 = ::time(NULL); long taken1 = static_cast(::difftime(tend1, tstart1) + 0.5); std::cerr << "get_properties : " << taken1 << " secs\n"; // Sort descending by value, i.e. mimic this Perl code in C++: // sort { $href->{$b} <=> $href->{$a} || $a cmp $b } keys %{$href} time_t tstart2 = ::time(NULL); #ifdef LLIL_SORT_VIA_MULTIMAP_L std::multimap newmap = invert_map(hash_ret); for (std::multimap::reverse_iterator it = newmap.rbegin(); it != newmap.rend(); ++it) { std::cout << it->second << '\t' << it->first << "\n"; } #else vec_str_int_type v( hash_ret.begin(), hash_ret.end() ); std::sort( v.begin(), v.end(), [](const str_int_type& left, const str_int_type& right) { return right.second != left.second ? right.second < left.second : left.first < right.first; } ); for ( const std::pair& n : v ) { std::cout << n.first << '\t' << n.second << "\n"; } #endif time_t tend2 = ::time(NULL); long taken2 = static_cast(::difftime(tend2, tstart2) + 0.5); long taken = static_cast(::difftime(tend2, tstart1) + 0.5); std::cerr << "sort + output : " << taken2 << " secs\n"; std::cerr << "total : " << taken << " secs\n"; return 0; } #### # gen-llil.pl # Crude program to generate a big LLiL test file to use in benchmarks # perl gen-llil.pl big2.txt 200 3 - produces a test file with size = 35,152,000 bytes use strict; use warnings; use autodie; { my $ordmin = ord('a'); my $ordmax = ord('z') + 1; # Generate a random word sub gen_random_word { my $word = shift; # word prefix my $nchar = shift; # the number of random chars to append for my $i (1 .. $nchar) { $word .= chr( $ordmin + int( rand($ordmax - $ordmin) ) ); } return $word; } } sub create_test_file { my $fname = shift; my $count = shift; my $wordlen = shift; open( my $fh_out, '>', $fname ); for my $c ( 'aaa' .. 'zzz' ) { for my $i (1 .. $count) { print {$fh_out} gen_random_word( $c, $wordlen ) . "\t" . 1 . "\n"; } } } my $outfile = shift; my $count = shift; my $wordlen = shift; $outfile or die "usage: $0 outfile count wordlen\n"; $count or die "usage: $0 outfile count wordlen\n"; print "generating test file '$outfile' with count '$count'\n"; create_test_file($outfile, $count, $wordlen); print "file size=", -s $outfile, "\n"; #### perl gen-llil.pl big1.txt 200 3 perl gen-llil.pl big2.txt 200 3 perl gen-llil.pl big3.txt 200 3 #### perl llil.pl big1.txt big2.txt big3.txt >perl.tmp #### llil big1.txt big2.txt big3.txt >cpp.tmp #### diff perl.tmp cpp.tmp #### > perl llil.pl big1.txt big2.txt big3.txt >perl.tmp llil start get_properties : 11 secs sort + output : 74 secs total : 85 secs #### > llil big1.txt big2.txt big3.txt >cpp.tmp llil start (sort version) get_properties : 9 secs sort + output : 7 secs total : 16 secs #### > diff cpp.tmp perl.tmp