in reply to Bidirectional lookup algorithm? (Updated: further info.)

You could lump the longer symbols together, a la join qq(\0), grep length() > 8, @syms, and access them via pointer or offset. Keep the short ones directly in the 8-byte slots of the symbol table (a union). This should get you down to ~10 bytes per key.

To index the tables, use minimal perfect hashing. There's the CMPH library, which appears quite simple to use. You'll need to work on the glue, though: the cmph bindings in Perfect::Hash are incomplete, and that module isn't on CPAN.

Anyway, with perfect hashing you need only a few extra bits per key to do the indexing. Various methods are available: CHM needs ~8 bytes per key, but preserves key order; BRZ takes ~8 bits per key and uses external memory while building the tables, so construction isn't memory-constrained; BDZ is just under 3 bits per key. Since you're storing the 8-byte keys anyway, these few extra bits don't add much.

Performance estimates: about 3 seconds per million keys when constructing the mph structures, and about 0.3 seconds per million lookups (0.3 usec each), compared to ~0.15 usec per lookup with ordinary direct hashes.

So expect lookups roughly two to three times slower, at about ten times the memory density. Whether the tradeoff is worth the effort, I'll leave for you to decide.

Re^2: Bidirectional lookup algorithm? (try perfect hashing)
by oiskuu (Hermit) on Jan 11, 2015 at 02:52 UTC

    Well, I experimented a little further with CMPH. It may be rough around the edges (e.g. temp files aren't cleaned up or created safely), but the good thing is that it basically works. First, I generated some data as follows... that took a while.

    #! /bin/sh
    perl -E '$/ = \36; for (1 .. 200e6) {
        ($v,$s) = unpack q<QA*>, <>;
        say +qq($_ @{[$s =~ y/a-zA-Z//cdr || redo]} $v) }' /dev/urandom |
    sort -k2 | perl -ape '$_ x= $F ne $F[1], $F = $F[1]' |
    sort -k3 | perl -ape '$_ x= $F ne $F[2], $F = $F[2]' |
    sort -n | perl -pe 's/\S*\s*//'
    In case many longer keys are present, it might be better to go with (4-byte) offset records: simple and compact, but with one more indirection, hence slower access.
    $ perl -anE '$h[length$F[0]]++ }{ say "@h";' data
    52 2704 140608 7190883 35010259 35855576 28751420 19240344 10899542 5278104 2202814 795438 249757 68269 16155 3388 640 89 12 1

    Then a small test. As you can see, I didn't bother mapping value indexes, but used an order-preserving hash instead. Memory usage comes to about 26 bytes per pair; double that while building the hashes.

    Edit. Same test with updated code; higher -O used.

    [  1.303844] data ALLOCATED; tab = 160002048, ss = 14680064 (10000000 pairs)
    [  5.478031] built BDZ for syms
    [  2.873171] inplace REORDER
    [ 20.015568] built CHM for vals
    [  0.000028] mph size when packed: syms = 3459398, vals = 83600028
    [  0.522367] fgets loop; lines=10000000
    [  1.195339] fgets+strtoul; lines=10000000
    [  2.235220] SYMS fetch; found=10000000
    [  2.940386] VALS fetch; found=10000000
    [  2.709484] VRFY sym to val; matched=10000000
    [  4.258673] VRFY two-way; matched=10000000
    Old output.
    $ ./a.out 10000000 data
    [0000000000.000000] begin
    [0000000001.299480] data READ and ALLOCATED
    N=10000000; tab = 160002048, ss = 14680064
    [0000000008.945925] built BDZ for syms
    [0000000000.003773] mph dump & reload
    [0000000005.915059] inplace REORDER done
    [0000000036.457684] built CHM for vals
    [0000000000.086798] mph dump & reload
    mph size when packed: syms = 3459398, vals = 83600028
    [0000000006.374049] completed VERIFY
    [0000000000.518565] fgets loop
    [0000000001.192738] fgets+strtoul loop
    [0000000003.706227] SYMS search and compare
    cksum=49999995000000
    [0000000004.366025] VALS search and compare
    cksum=49999995000000
    [0000000000.000012] done.

      oiskuu. Thanks for pursuing this and posting your findings. Unfortunately, I do not understand what I am reading.

      1. You read 200 million 36 byte records from /dev/urandom;

        Then you unpack them as: a) a 64-bit uint; and b) the rest as a string, from which you remove any non-alpha characters (with a redo if nothing is left);

        And print the string and int to stdout;

        Which you pipe through: sort & perl & sort & perl & sort & perl to ???

      2. You then count the lengths of the first fields from a file called data?
      3. You then run an executable called a.out, supplying a number and the data file?

        Which produces some output?


Re^2: Bidirectional lookup algorithm? (try perfect hashing)
by oiskuu (Hermit) on Jan 13, 2015 at 02:21 UTC

    Ok. Here's a crude demo in case someone is interested in doing benchmarking comparisons or whatever. The program is pure C, tested on 64-bit Linux. Give it two arguments: Nkeys and Filename. The file should contain sym-value pairs, both unique in their domains (space-separated, one pair per line).

    #define _GNU_SOURCE // for mremap
    #include <stdio.h>
    #include <stdlib.h> // malloc, free, exit, strtoul
    #include <string.h>
    #include <sys/mman.h>
    #include <cmph.h>
    #include <stdarg.h>
    #include <sys/time.h>

    // print elapsed time since the previous call, then the message
    __attribute__((format(printf,1,2)))
    static void ttprintf(char *fmt, ...)
    {
        static struct timeval tv;
        struct timeval t0 = tv;
        gettimeofday(&tv, NULL);
        int c = (tv.tv_usec < t0.tv_usec);
        printf("[%10ld.%06ld] ", tv.tv_sec - t0.tv_sec - c,
               tv.tv_usec - t0.tv_usec + 1000000 * c);
        va_list ap;
        va_start(ap, fmt);
        vprintf(fmt, ap);
        va_end(ap);
        printf("\n");
    }

    enum { XMTOP = 2UL << 20 }; // 2 MB increments
    struct xm { void *addr; size_t size, used; };

    static void *xmalloc(struct xm *m, size_t len)
    {
        m->size = (len + 0xfff) & ~(size_t)0xfff;
        m->used = 0;
        m->addr = mmap(NULL, m->size, PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (m->addr == MAP_FAILED)
            perror("mmap"), exit(1);
        return m->addr;
    }

    static void *xmtop(struct xm *m, size_t reserve)
    {
        if (m->size < m->used + reserve) {
            m->size += XMTOP;
            m->addr = mremap(m->addr, m->size - XMTOP, m->size, MREMAP_MAYMOVE);
            if (m->addr == MAP_FAILED)
                perror("mremap"), exit(1);
        }
        return (char *)m->addr + m->used;
    }

    static void *memdup(void *src, size_t len)
    {
        void *dst = malloc(len);
        if (!dst)
            perror("malloc"), exit(1);
        return memcpy(dst, src, len);
    }

    static unsigned long atoul(char *a) { return strtoul(a, NULL, 10); }

    static FILE *f_open(char *fnam, char *mode)
    {
        FILE *f = fopen(fnam, mode);
        if (!f)
            perror(fnam), exit(1);
        return f;
    }

    static void f_rewind(FILE *fp)
    {
        if (fseek(fp, 0, SEEK_SET))
            perror("fseek"), exit(1);
    }

    typedef cmph_uint32 klen_t;
    typedef size_t bm_idx;

    typedef struct {
        struct xm tab, ss;
        bm_idx nkeys, pos;
        cmph_t *symhash;
        cmph_t *valhash;
    } bimap;

    typedef union {
        char a[8];
        size_t i;
        size_t o;
        //size_t o:32; // big-endian: use fewer bits so .a[7] stays 0
    } bm_ent;

    typedef struct { bm_ent s, v; } bm_tab;

    static inline bm_ent *bm_tab_val(bimap *bm, bm_idx i)
    {
        return &((bm_tab *)bm->tab.addr)[i].v;
    }

    static inline bm_ent *bm_tab_sym(bimap *bm, bm_idx i)
    {
        return &((bm_tab *)bm->tab.addr)[i].s;
    }

    // assume the keys are always ' ' terminated!
    static inline void bm_fill_sym(bimap *bm, bm_idx i, char *key, size_t len)
    {
        bm_ent *s = bm_tab_sym(bm, i);
        if (len > sizeof(s->a)) {
            s->o = bm->ss.used;
            bm->ss.used += len + 1;
            return;
        }
        memcpy(s->a, key, sizeof(s->a));
        if (len < sizeof(s->a))
            s->a[7] = ' ';
    }

    static inline void bm_get_key(bimap *bm, bm_idx i, char **key, klen_t *keylen)
    {
        bm_ent *s = bm_tab_sym(bm, i);
        if (s->a[7]) {
            char *p = memchr(s->a, ' ', sizeof(s->a));
            *key = s->a;
            *keylen = p ? p - s->a : sizeof(s->a);
        } else {
            char *p = (char *)bm->ss.addr + s->o;
            *key = p;
            *keylen = strchr(p, ' ') - p;
        }
    }

    static void bm_tab_reorder(bimap *bm)
    {
        bm_idx i, id;
        char *key;
        klen_t keylen;
        for (i = 0; i < bm->nkeys;) {
            bm_get_key(bm, i, &key, &keylen);
            id = cmph_search(bm->symhash, key, keylen);
            if (id != i) {
                bm_tab tmp, *t = bm->tab.addr;
                tmp = t[id], t[id] = t[i], t[i] = tmp;
            } else
                i++;
        }
    }

    static bm_tab *fetch_by_key(bimap *bm, char *key, klen_t keylen)
    {
        bm_idx id = cmph_search(bm->symhash, key, keylen);
        bm_tab *t = (bm_tab *)bm->tab.addr + id;
        char *p = t->s.a[7] ? t->s.a : (char *)bm->ss.addr + t->s.o;
        if (memcmp(p, key, keylen))
            return NULL;
        if (keylen != sizeof(t->s.a) && p[keylen] != ' ')
            return NULL;
        return t;
    }

    static bm_tab *fetch_by_val(bimap *bm, size_t val)
    {
        bm_ent v = {};
        v.i = val;
        bm_idx id = cmph_search(bm->valhash, v.a, sizeof(v.a));
        bm_tab *t = (bm_tab *)bm->tab.addr + id;
        if (t->v.i != v.i)
            return NULL;
        return t;
    }

    // malloc-ated keys are expected by FCH, but that's too slow anyway...
    int foo_adapter_read_key(void *data, char **key, cmph_uint32 *keylen)
    {
        bimap *bm = data;
        bm_get_key(bm, bm->pos++, key, keylen);
        // *key = memdup(*key, *keylen);
        return *keylen;
    }

    int foo_adapter_read_val(void *data, char **key, cmph_uint32 *keylen)
    {
        bimap *bm = data;
        bm_ent *v = bm_tab_val(bm, bm->pos++);
        *key = v->a;
        *keylen = sizeof(v->a);
        // *key = memdup(*key, *keylen);
        return *keylen;
    }

    void foo_adapter_dispose(void *data, char *key, cmph_uint32 keylen)
    {
        // free(key);
    }

    void foo_adapter_rewind(void *data)
    {
        bimap *bm = data;
        bm->pos = 0;
    }

    cmph_io_adapter_t *foo_adapter_new(bimap *bm, int dir)
    {
        cmph_io_adapter_t source = {
            .data = bm,
            .nkeys = bm->nkeys,
            .read = dir ? foo_adapter_read_key : foo_adapter_read_val,
            .dispose = foo_adapter_dispose,
            .rewind = foo_adapter_rewind
        };
        return memdup(&source, sizeof(source));
    }

    void foo_adapter_destroy(cmph_io_adapter_t *source)
    {
        free(source);
    }

    static cmph_t *build_mph(cmph_io_adapter_t *source, int algo, char *dfile)
    {
        cmph_config_t *config;
        cmph_t *r;
        FILE *mphf = NULL;

        config = cmph_config_new(source);
        cmph_config_set_algo(config, algo);
        //cmph_config_set_verbosity(config, 1);
        if (dfile) {
            mphf = f_open(dfile, "w+");
            cmph_config_set_mphf_fd(config, mphf);
        }
        r = cmph_new(config);
        if (!r)
            printf("no cigar"), exit(2);
        if (dfile) { // ....BRZ needs reload, will crash otherwise?
            cmph_dump(r, mphf);
            cmph_destroy(r);
            f_rewind(mphf);
            r = cmph_load(mphf);
            fclose(mphf);
        }
        cmph_config_destroy(config);
        return r;
    }

    int main(int argc, char **argv)
    {
        enum { LLEN = 1024 };
        FILE *f;
        bimap M;
        size_t n, i, ck;
        cmph_io_adapter_t *source;

        if (argc < 3 || (n = atoul(argv[1])) < 3)
            exit(1);
        f = f_open(argv[2], "r");
        xmalloc(&M.tab, n * sizeof(bm_tab));
        xmalloc(&M.ss, XMTOP);
        ttprintf("begin");
        for (i = 0; i < n; i++) {
            char *e, *p = xmtop(&M.ss, LLEN);
            if (!fgets(p, LLEN, f) || !(e = strchr(p, ' ')))
                break;
            bm_fill_sym(&M, i, p, e - p);
            bm_tab_val(&M, i)->i = atoul(e);
        }
        M.nkeys = n = i;
        ttprintf("data ALLOCATED; tab = %zu, ss = %zu (%zu pairs)",
                 M.tab.size, M.ss.size, n);

        source = foo_adapter_new(&M, 1);
        M.symhash = build_mph(source, CMPH_BDZ, NULL);
        foo_adapter_destroy(source);
        ttprintf("built BDZ for syms");

        bm_tab_reorder(&M);
        ttprintf("inplace REORDER");

        source = foo_adapter_new(&M, 0);
        M.valhash = build_mph(source, CMPH_CHM, NULL);
        foo_adapter_destroy(source);
        ttprintf("built CHM for vals");

        // cmph_search_packed() was a tad slower in my testing. pointers vs offsets probably.
        ttprintf("mph size when packed: syms = %zu, vals = %zu",
                 (size_t)cmph_packed_size(M.symhash),
                 (size_t)cmph_packed_size(M.valhash));

    #define LOOPIT(foo) \
        f_rewind(f); \
        for (ck = i = 0; i < n; i++) { \
            char *e, p[LLEN]; \
            if (!fgets(p, LLEN, f)) \
                break; \
            if (!(e = strchr(p, ' '))) \
                break; \
            foo; \
        }

        LOOPIT(ck++)
        ttprintf("fgets loop; lines=%zu", ck);
        LOOPIT(ck++; atoul(e))
        ttprintf("fgets+strtoul; lines=%zu", ck);
        LOOPIT(ck += !!fetch_by_key(&M, p, e - p))
        ttprintf("SYMS fetch; found=%zu", ck);
        LOOPIT(ck += !!fetch_by_val(&M, atoul(e)))
        ttprintf("VALS fetch; found=%zu", ck);
        LOOPIT(ck += fetch_by_key(&M, p, e - p)->v.i == atoul(e))
        ttprintf("VRFY sym to val; matched=%zu", ck);
        LOOPIT(ck += fetch_by_key(&M, p, e - p) == fetch_by_val(&M, atoul(e)))
        ttprintf("VRFY two-way; matched=%zu", ck);

        fclose(f);
        ttprintf("done.");
        return 0;
    }

    Update. Some additional notes regarding the program.