#include <tkrzw_dbm_hash.h>
#include <tkrzw_dbm_shard.h>
...
// tkrzw::HashDBM dbm;
// dbm.Open("/dev/shm/casket.tkh", true).OrDie();
tkrzw::ShardDBM dbm;
const std::map<std::string, std::string> params = {
   {"num_shards", "8"}, {"dbm", "HashDBM"} };
dbm.OpenAdvanced("/dev/shm/casket.tkh", true, tkrzw::File::OPEN_DEFAULT, params);
for (int i = 0; i < nfiles; ++i)
   get_properties(fname[i], nthds, dbm);
dbm.Close();

####

casket.tkh-00000-of-00008
casket.tkh-00001-of-00008
casket.tkh-00002-of-00008
casket.tkh-00003-of-00008
casket.tkh-00004-of-00008
casket.tkh-00005-of-00008
casket.tkh-00006-of-00008
casket.tkh-00007-of-00008

####

  8 shards : 11.958 secs   7.643 mil QPS
 16 shards :  6.680 secs  13.682 mil QPS
 32 shards :  4.424 secs  20.659 mil QPS
 64 shards :  3.419 secs  26.732 mil QPS
 96 shards :  3.052 secs  29.946 mil QPS
128 shards :  2.903 secs  31.483 mil QPS

####

#include <byteswap.h>
...
// Process max Nthreads chunks concurrently.
while (first < last) {
   char* beg_ptr{first};
   first = find_char(first, last, '\n');
   char* end_ptr{first};
   ++first;

   if ((found = find_char(beg_ptr, end_ptr, '\t')) == end_ptr)
      continue;

   count = fast_atoll64(found + 1);
   klen = std::min(MAX_STR_LEN_L, (size_t)(found - beg_ptr));
   std::basic_string_view<char> key{ reinterpret_cast<const char*>(beg_ptr), klen };

   dbm_ret.IncrementSimple(key, count);
   // std::string value = dbm_ret.GetSimple(key);
   // int64_t *bigendian_num = reinterpret_cast<int64_t*>(value.data());
   // std::cout << key << ": " << bswap_64(*bigendian_num) << "\n";
}

####

// Store the properties into a vector
vec_str_int_type propvec;
propvec.reserve(num_keys);

std::string key, value;
int64_t *bigendian_num = reinterpret_cast<int64_t*>(value.data());
std::unique_ptr<tkrzw::DBM::Iterator> iter = dbm.MakeIterator();
iter->First();
while (iter->Get(&key, &value) == tkrzw::Status::SUCCESS) {
   propvec.emplace_back(key, bswap_64(*bigendian_num));
   iter->Next();
}
dbm.Close();

####

$ NUM_THREADS=8 NUM_MAPS=4 ./llil4tkh big{1,2,3}.txt | cksum
llil4tkh (fixed string length=12) start
use OpenMP
use boost sort
get properties      2.978 secs
shardDBM to vector  5.848 secs
vector stable sort  0.157 secs
write stdout        0.213 secs
total time          9.197 secs
2956888413 93308427

####

$ perl llilthc.pl --threads=8 --maps=4 big{1,2,3}.txt | cksum
Tokyo Cabinet hash database - start
fixed string length=12, threads=8, maps=4
get properties   :  5.487 secs
pack properties  :  3.545 secs
sort packed data :  0.969 secs
write stdout     :  0.764 secs
total time       : 10.769 secs
count lines      : 10545600
count unique     : 10367603
2956888413 93308427

####

// Store the properties into a vector
vec_str_int_type propvec;
propvec.reserve(num_keys);

#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < nmaps; ++i) {
   // casket.tkh-00000-of-00004
   // casket.tkh-00001-of-00004
   // casket.tkh-00002-of-00004
   // casket.tkh-00003-of-00004
   char path[255];
   std::sprintf(path, "/dev/shm/casket.tkh-%05d-of-%05d", i, nmaps);

   tkrzw::HashDBM dbm;
   dbm.Open(path, false).OrDie();

   int64_t num_keys = dbm.CountSimple();

   if (num_keys > 0) {
      vec_str_int_type locvec;
      locvec.reserve(num_keys);

      std::string key, value;
      int64_t *bigendian_num = reinterpret_cast<int64_t*>(value.data());
      std::unique_ptr<tkrzw::DBM::Iterator> iter = dbm.MakeIterator();
      iter->First();
      while (iter->Get(&key, &value) == tkrzw::Status::SUCCESS) {
         locvec.emplace_back(key, bswap_64(*bigendian_num));
         iter->Next();
      }

      #pragma omp critical
      propvec.insert(
         // Append local vector to propvec
         propvec.end(),
         std::make_move_iterator(locvec.begin()),
         std::make_move_iterator(locvec.end())
      );
   }

   dbm.Close();
}

####

$ NUM_THREADS=8 NUM_MAPS=4 ./llil4tkh big{1,2,3}.txt | cksum
llil4tkh (fixed string length=12) start
use OpenMP
use boost sort
get properties      2.985 secs
shardDBM to vector  1.381 secs
vector stable sort  0.157 secs
write stdout        0.214 secs
total time          4.739 secs
2956888413 93308427

$ NUM_THREADS=8 NUM_MAPS=8 ./llil4tkh big{1,2,3}.txt | cksum
llil4tkh (fixed string length=12) start
use OpenMP
use boost sort
get properties      2.106 secs
shardDBM to vector  0.683 secs
vector stable sort  0.159 secs
write stdout        0.208 secs
total time          3.157 secs
2956888413 93308427

$ NUM_THREADS=8 NUM_MAPS=32 ./llil4tkh big{1,2,3}.txt | cksum
llil4tkh (fixed string length=12) start
use OpenMP
use boost sort
get properties      1.364 secs
shardDBM to vector  0.639 secs
vector stable sort  0.159 secs
write stdout        0.207 secs
total time          2.372 secs
2956888413 93308427

####

$ perl lliltch.pl --threads=48 --maps=max in/biga* | cksum
Tokyo Cabinet hash database - start
fixed string length=12, threads=48, maps=128
get properties   :  9.533 secs   9.587 mil QPS
pack properties  :  3.276 secs  24.151 mil QPS
sort packed data :  6.826 secs
write stdout     :  1.631 secs
total time       : 21.284 secs
count lines      : 91395200
count unique     : 79120065
2005669956 712080585

$ NUM_THREADS=48 NUM_MAPS=128 ./llil4tkh in/biga* | cksum
llil4tkh (fixed string length=12) start
sharding managed by the tkrzw::ShardDBM library
use OpenMP
use boost sort
get properties      2.872 secs  31.823 mil QPS
shardDBM to vector  1.546 secs  51.177 mil QPS
vector stable sort  1.399 secs
write stdout        1.561 secs
total time          7.380 secs
2005669956 712080585

####

$ NUM_THREADS=48 NUM_MAPS=128 ./llil4tkh2 in/biga* | cksum
llil4tkh2 (fixed string length=12) start
sharding managed by the application
use OpenMP
use boost sort
get properties      2.337 secs  39.108 mil QPS
hashDBMs to vector  1.607 secs  49.235 mil QPS
vector stable sort  1.379 secs
write stdout        1.576 secs
total time          6.900 secs
2005669956 712080585

####

$ NUM_THREADS=48 NUM_MAPS=128 ./llil4tkh2 in/biga* | cksum
llil4tkh2 (fixed string length=12) start
sharding managed by the application
use OpenMP
use boost sort
get properties      2.331 secs  39.209 mil QPS
hashDBMs to vector  1.420 secs  55.718 mil QPS
vector stable sort  0.663 secs
write stdout        1.541 secs
total time          5.957 secs
2005669956 712080585