// A demonstration of alpha skip search on bitstrings // // dd if=/dev/urandom bs=1k count=99k > garbage // ./a.out [length] [offset] < garbage // // Feed the program with random needle and haystack. Two arguments may be given. // Haystack offset says where to hide the needle. Needle length defaults to 65536. #include #include #include #include #include // F = log(length) is theoretically optimal. Mmax is our maximum needle size. // if logM is changed, be sure to also check the bt_off_t below enum { logM = 32, Mmax = 1UL << logM, Mnil = Mmax - 1UL, Mdflt = 65536 }; enum { F = 16, Ftop = 1UL << F, Fmask = Ftop - 1UL }; typedef uint32_t bt_off_t; // must hold 0..Mnil typedef struct { bt_off_t head[Ftop]; bt_off_t next[]; } bt_index; // extract F bits at bit offset i. assert(F <= 25) // this is non-portable static inline size_t fbits(char *v, size_t i) { uint32_t *p = (uint32_t *)&v[i >> 3]; return Fmask & (*p >> (i & 7)); } static size_t xcmp(char *y, size_t j, char *x, size_t w) { size_t i, diff = fbits(y, j) - fbits(x, 0); for (i = F; i < w && !diff; i += F) { diff = fbits(y, i + j) - fbits(x, i); } if (!diff) diff = fbits(y, w-1 + j) - fbits(x, w-1); return diff; } static bt_index *prep(char *x, size_t w) { bt_index *Z = malloc(sizeof(*Z) + w * sizeof(*Z->next)); size_t i, f; for (i = 0; i < Ftop; i++) { Z->head[i] = Mnil; } for (i = 0; i < w; i++) { f = fbits(x, i); Z->next[i] = Z->head[f]; Z->head[f] = i; } printf("bt_index %zu bytes\n", sizeof(*Z) + w * sizeof(*Z->next)); return Z; } // y of length n is haystack; x is the needle size_t search(char *y, size_t n, char *x, size_t m) { size_t w = m - F + 1; // shift size size_t i, j, f; bt_index *Z = prep(x, w); for (i = -1; (i += w) <= n - F;) { f = fbits(y, i); for (j = Z->head[f]; j != Mnil; j = Z->next[j]) { if (!xcmp(y, i-j, x, w)) return i-j; } } return -1; } // one-bit rotate void rot1(char *v, size_t n) { unsigned char *p = (unsigned char *)v; size_t i, c; for (c = i = 0; i < n; c >>= 8, i++) { p[i] = c += p[i] << 1; } p[0] += c; } int main(int argc, char *argv[]) { static char buf[(99<<20) + 1]; size_t n = read(0, buf, sizeof(buf) - 1); size_t t = argc > 1 ? strtoul(argv[1], NULL, 0) : 0; size_t m = (t < F || t - 1 > Mnil) ? Mdflt : t; char *x = buf, *y = buf + (m+7)/8; if ((long)n < 0 || (long)n < 2*(y-x)) { printf("read %ld bytes, need at least %ld\n", (long)n, (long)2*(y-x)); return 1; } n = 8 * (n - (y-x)); printf("haystack=%zu needle=%zu (bits)\n", n, m); if (argc > 2) { size_t r, i = strtoul(argv[2], NULL, 0); i += (long)i < 0 ? (n-m) : 0; i = i < (n-m) ? i : (n-m); memcpy(y + i/8, x, (y-x)); // stuff the needle and rotate for (r = i & 7; r; r--) rot1(y + i/8, (y-x) + 1); printf("needle at %zu (byte offset %zu, rot %zu)\n", i, i/8, i%8); } printf("search=%ld\n", (long)search(y, n, x, m)); return 0; }