#define _GNU_SOURCE // for mremap #include #include #include #include #include #include __attribute__((format(printf,1,2))) static void ttprintf(char *fmt, ...) { static struct timeval tv; struct timeval t0 = tv; gettimeofday(&tv, NULL); int c = (tv.tv_usec < t0.tv_usec); printf("[%10ld.%06ld] ", tv.tv_sec - t0.tv_sec - c, tv.tv_usec - t0.tv_usec + 1000000 * c); va_list ap; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } enum { XMTOP = 2UL << 20 }; // 2 MB increments struct xm { void *addr; size_t size, used; }; static void *xmalloc(struct xm *m, size_t len) { m->size = (len + 0xfff) & ~(size_t)0xfff; m->used = 0; m->addr = mmap(NULL, m->size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (m->addr == MAP_FAILED) perror("mmap"), exit(1); return m->addr; } static void *xmtop(struct xm *m, size_t reserve) { if (m->size < m->used + reserve) { m->size += XMTOP; m->addr = mremap(m->addr, m->size - XMTOP, m->size, MREMAP_MAYMOVE); if (m->addr == MAP_FAILED) perror("mremap"), exit(1); } return (char *)m->addr + m->used; } static void *memdup(void *src, size_t len) { void *dst = malloc(len); if (!dst) perror("malloc"), exit(1); return memcpy(dst, src, len); } static unsigned long atoul(char *a) { return strtoul(a, NULL, 10); } static FILE *f_open(char *fnam, char *mode) { FILE *f = fopen(fnam, mode); if (!f) perror(fnam), exit(1); return f; } static void f_rewind(FILE *fp) { if (fseek(fp, 0, SEEK_SET)) perror("fseek"), exit(1); } typedef cmph_uint32 klen_t; typedef size_t bm_idx; typedef struct { struct xm tab, ss; bm_idx nkeys, pos; cmph_t *symhash; cmph_t *valhash; } bimap; typedef union { char a[8]; size_t i; size_t o; //size_t o:32; // big-endian: use fewer bits so .a[7] stays 0 } bm_ent; typedef struct { bm_ent s, v; } bm_tab; static inline bm_ent *bm_tab_val(bimap *bm, bm_idx i) { return &((bm_tab *)bm->tab.addr)[i].v; } static inline bm_ent *bm_tab_sym(bimap *bm, bm_idx i) { return &((bm_tab *)bm->tab.addr)[i].s; } // assume the keys are always ' ' terminated! static inline void bm_fill_sym(bimap *bm, bm_idx i, char *key, size_t len) { bm_ent *s = bm_tab_sym(bm, i); if (len > sizeof(s->a)) { s->o = bm->ss.used; bm->ss.used += len + 1; return; } memcpy(s->a, key, sizeof(s->a)); if (len < sizeof(s->a)) s->a[7] = ' '; } static inline void bm_get_key(bimap *bm, bm_idx i, char **key, klen_t *keylen) { bm_ent *s = bm_tab_sym(bm, i); if (s->a[7]) { char *p = memchr(s->a, ' ', sizeof(s->a)); *key = s->a; *keylen = p ? p - s->a : sizeof(s->a); } else { char *p = (char*)bm->ss.addr + s->o; *key = p; *keylen = strchr(p, ' ') - p; } } static void bm_tab_reorder(bimap *bm) { bm_idx i, id; char *key; klen_t keylen; for (i = 0; i < bm->nkeys;) { bm_get_key(bm, i, &key, &keylen); id = cmph_search(bm->symhash, key, keylen); if (id != i) { bm_tab tmp, *t = bm->tab.addr; tmp = t[id], t[id] = t[i], t[i] = tmp; } else i++; } } static bm_tab *fetch_by_key(bimap *bm, char *key, klen_t keylen) { bm_idx id = cmph_search(bm->symhash, key, keylen); bm_tab *t = (bm_tab *)bm->tab.addr + id; char *p = t->s.a[7] ? t->s.a : (char*)bm->ss.addr + t->s.o; if (memcmp(p, key, keylen)) return NULL; if (keylen != sizeof(t->s.a) && p[keylen] != ' ') return NULL; return t; } static bm_tab *fetch_by_val(bimap *bm, size_t val) { bm_ent v = {}; v.i = val; bm_idx id = cmph_search(bm->valhash, v.a, sizeof(v.a)); bm_tab *t = (bm_tab *)bm->tab.addr + id; if (t->v.i != v.i) return NULL; return t; } // malloc-ated keys are expected by FCH, but that's too slow anyway.. int foo_adapter_read_key(void *data, char **key, cmph_uint32 *keylen) { bimap *bm = data; bm_get_key(bm, bm->pos++, key, keylen); // *key = memdup(*key, *keylen); return *keylen; } int foo_adapter_read_val(void *data, char **key, cmph_uint32 *keylen) { bimap *bm = data; bm_ent *v = bm_tab_val(bm, bm->pos++); *key = v->a; *keylen = sizeof(v->a); // *key = memdup(*key, *keylen); return *keylen; } void foo_adapter_dispose(void *data, char *key, cmph_uint32 keylen) { // free(key); } void foo_adapter_rewind(void *data) { bimap *bm = data; bm->pos = 0; } cmph_io_adapter_t *foo_adapter_new(bimap *bm, int dir) { cmph_io_adapter_t source = { .data = bm, .nkeys = bm->nkeys, .read = dir ? foo_adapter_read_key : foo_adapter_read_val, .dispose = foo_adapter_dispose, .rewind = foo_adapter_rewind }; return memdup(&source, sizeof(source)); } void foo_adapter_destroy(cmph_io_adapter_t *source) { free(source); } static cmph_t *build_mph(cmph_io_adapter_t *source, int algo, char *dfile) { cmph_config_t *config; cmph_t *r; FILE *mphf = NULL; config = cmph_config_new(source); cmph_config_set_algo(config, algo); //cmph_config_set_verbosity(config, 1); if (dfile) { mphf = f_open(dfile, "w+"); cmph_config_set_mphf_fd(config, mphf); } r = cmph_new(config); if (!r) printf("no cigar"), exit(2); if (dfile) { // ....BRZ needs reload, will crash otherwise? cmph_dump(r, mphf); cmph_destroy(r); f_rewind(mphf); r = cmph_load(mphf); fclose(mphf); } cmph_config_destroy(config); return r; } int main(int argc, char **argv) { enum { LLEN = 1024 }; FILE *f; bimap M; size_t n, i, ck; cmph_io_adapter_t *source; if (argc < 3 || (n = atoul(argv[1])) < 3) exit(1); f = f_open(argv[2], "r"); xmalloc(&M.tab, n * sizeof(bm_tab)); xmalloc(&M.ss, XMTOP); ttprintf("begin"); for (i = 0; i < n; i++) { char *e, *p = xmtop(&M.ss, LLEN); if (!fgets(p, LLEN, f) || !(e = strchr(p, ' '))) break; bm_fill_sym(&M, i, p, e-p); bm_tab_val(&M, i)->i = atoul(e); } M.nkeys = n = i; ttprintf("data ALLOCATED; tab = %zu, ss = %zu (%zu pairs)", M.tab.size, M.ss.size, n); source = foo_adapter_new(&M, 1); M.symhash = build_mph(source, CMPH_BDZ, NULL); foo_adapter_destroy(source); ttprintf("built BDZ for syms"); bm_tab_reorder(&M); ttprintf("inplace REORDER"); source = foo_adapter_new(&M, 0); M.valhash = build_mph(source, CMPH_CHM, NULL); foo_adapter_destroy(source); ttprintf("built CHM for vals"); // cmph_search_packed() was a tad slower in my testing. pointers vs offsets probably. ttprintf("mph size when packed: syms = %zu, vals = %zu", (size_t)cmph_packed_size(M.symhash), (size_t)cmph_packed_size(M.valhash)); #define LOOPIT(foo) \ f_rewind(f); \ for (ck = i = 0; i < n; i++) { \ char *e, p[LLEN]; \ if (!fgets(p, LLEN, f)) \ break; \ if (!(e=strchr(p, ' ')))\ break; \ foo; \ } LOOPIT(ck++) ttprintf("fgets loop; lines=%zu", ck); LOOPIT(ck++; atoul(e)) ttprintf("fgets+strtoul; lines=%zu", ck); LOOPIT(ck += !!fetch_by_key(&M, p, e-p)) ttprintf("SYMS fetch; found=%zu", ck); LOOPIT(ck += !!fetch_by_val(&M, atoul(e))) ttprintf("VALS fetch; found=%zu", ck); LOOPIT(ck += fetch_by_key(&M, p, e-p)->v.i == atoul(e)) ttprintf("VRFY sym to val; matched=%zu", ck); LOOPIT(ck += fetch_by_key(&M, p, e-p) == fetch_by_val(&M, atoul(e))) ttprintf("VRFY two-way; matched=%zu", ck); fclose(f); ttprintf("done."); return 0; }