in reply to Re: [OT] The interesting problem of comparing (long) bit-strings.
in thread [OT] The interesting problem of comparing bit-strings.
I'll attempt a brief description of the Alpha Skip Search. Hopefully, this may shed some light on the basic operation of the algorithm.
Consider a pattern of length M. The approach is to quickly locate any fractional substring within the pattern, wherever it may occur. Substrings of length F take offsets from 0 .. M-F, inclusive. Thus there is a set of M-F+1 needle fragments. The preprocessing stage creates an index of those fragments. A perl implementation might read as follows:
push @{$index{ substr($needle, $_, $F) }}, $_ for 0 .. $M-$F;
Search phase proceeds by inspecting the haystack one window at a time. Window shift is the same figure as above, M-F+1. Each time, a substring is extracted at the end of the window. The index now yields all possible alignments where this fragment will match in the pattern.
Theoretical optimum for F is logsM (log based on alphabet size). That amounts to one entry per index bucket, on the average. For random data, there will be one alignment to test per window, and this is likely to fail at the very start.
In the case of bit-string matching, the substrings are nothing but small integers below (1<<F), which suggests a very simple array-based indexing scheme (instead of trie). Here's a sample implementation in C (just a rehash of the previous demo, main difference being this one handles variable length needles).
// A demonstration of alpha skip search on bitstrings // // dd if=/dev/urandom bs=1k count=99k > garbage // ./a.out [length] [offset] < garbage // // Feed the program with random needle and haystack. Two arguments may + be given. // Haystack offset says where to hide the needle. Needle length defaul +ts to 65536. #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <unistd.h> // F = log(length) is theoretically optimal. Mmax is our maximum needl +e size. // if logM is changed, be sure to also check the bt_off_t below enum { logM = 32, Mmax = 1UL << logM, Mnil = Mmax - 1UL, Mdflt = 65536 + }; enum { F = 16, Ftop = 1UL << F, Fmask = Ftop - 1UL }; typedef uint32_t bt_off_t; // must hold 0..Mnil typedef struct { bt_off_t head[Ftop]; bt_off_t next[]; } bt_index; // extract F bits at bit offset i. assert(F <= 25) // this is non-portable static inline size_t fbits(char *v, size_t i) { uint32_t *p = (uint32_t *)&v[i >> 3]; return Fmask & (*p >> (i & 7)); } static size_t xcmp(char *y, size_t j, char *x, size_t w) { size_t i, diff = fbits(y, j) - fbits(x, 0); for (i = F; i < w && !diff; i += F) { diff = fbits(y, i + j) - fbits(x, i); } if (!diff) diff = fbits(y, w-1 + j) - fbits(x, w-1); return diff; } static bt_index *prep(char *x, size_t w) { bt_index *Z = malloc(sizeof(*Z) + w * sizeof(*Z->next)); size_t i, f; for (i = 0; i < Ftop; i++) { Z->head[i] = Mnil; } for (i = 0; i < w; i++) { f = fbits(x, i); Z->next[i] = Z->head[f]; Z->head[f] = i; } printf("bt_index %zu bytes\n", sizeof(*Z) + w * sizeof(*Z->next)); return Z; } // y of length n is haystack; x is the needle size_t search(char *y, size_t n, char *x, size_t m) { size_t w = m - F + 1; // shift size size_t i, j, f; bt_index *Z = prep(x, w); for (i = -1; (i += w) <= n - F;) { f = fbits(y, i); for (j = Z->head[f]; j != Mnil; j = Z->next[j]) { if (!xcmp(y, i-j, x, w)) return i-j; } } return -1; } // one-bit rotate void rot1(char *v, size_t n) { unsigned char *p = (unsigned char *)v; size_t i, c; for (c = i = 0; i < n; c >>= 8, i++) { p[i] = c += p[i] << 1; } p[0] += c; } int main(int argc, char *argv[]) { static char buf[(99<<20) + 1]; size_t n = read(0, buf, sizeof(buf) - 1); size_t t = argc > 1 ? strtoul(argv[1], NULL, 0) : 0; size_t m = (t < F || t - 1 > Mnil) ? Mdflt : t; char *x = buf, *y = buf + (m+7)/8; if ((long)n < 0 || (long)n < 2*(y-x)) { printf("read %ld bytes, need at least %ld\n", (long)n, (long)2 +*(y-x)); return 1; } n = 8 * (n - (y-x)); printf("haystack=%zu needle=%zu (bits)\n", n, m); if (argc > 2) { size_t r, i = strtoul(argv[2], NULL, 0); i += (long)i < 0 ? (n-m) : 0; i = i < (n-m) ? i : (n-m); memcpy(y + i/8, x, (y-x)); // stuff the needle and rotate for (r = i & 7; r; r--) rot1(y + i/8, (y-x) + 1); printf("needle at %zu (byte offset %zu, rot %zu)\n", i, i/8, i +%8); } printf("search=%ld\n", (long)search(y, n, x, m)); return 0; }
Tested with gcc/clang under linux; my apologies if the code is too obscure, or otherwise unworkable.
Update. Simplified above code. Note that it is sometimes difficult to put a name on a string search routine, especially with bit-strings. Minor modification can turn one search into another, and the variants abound. Here's a version that might be called Horspool.
// A demonstration of Horspool variant on bitstrings // // dd if=/dev/urandom bs=1k count=99k > garbage // ./a.out [length] [offset] < garbage // // Feed the program with random needle and haystack. Two arguments may + be given. // Haystack offset says where to hide the needle. Needle length defaul +ts to 65536. #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include <unistd.h> // if logM is changed, be sure to also check the bt_off_t below enum { logM = 32, Mmax = 1UL << logM, Mnil = Mmax - 1UL, Mdflt = 65536 + }; enum { F = 16, Ftop = 1UL << F, Fmask = Ftop - 1UL }; typedef uint32_t bt_off_t; // must hold 0..Mnil typedef struct { bt_off_t head[Ftop]; } bt_index; // extract F bits at bit offset i. assert(F <= 25) // this is non-portable static inline size_t fbits(char *v, size_t i) { uint32_t *p = (uint32_t *)&v[i >> 3]; return Fmask & (*p >> (i & 7)); } static size_t xcmp(char *y, size_t j, char *x, size_t w) { size_t i, diff = fbits(y, j) - fbits(x, 0); for (i = F; i < w && !diff; i += F) { diff = fbits(y, i + j) - fbits(x, i); } if (!diff) diff = fbits(y, w-1 + j) - fbits(x, w-1); return diff; } static bt_index *prep(char *x, size_t w) { static bt_index Z[1]; size_t i, f; for (i = 0; i < Ftop; i++) { Z->head[i] = Mnil; } for (i = 0; i < w; i++) { f = fbits(x, i); Z->head[f] = i; } printf("bt_index %zu bytes\n", sizeof(*Z)); return Z; } // y of length n is haystack; x is the needle size_t search(char *y, size_t n, char *x, size_t m) { size_t w = m - F + 1; // shift size size_t i, j, f; bt_index *Z = prep(x, w); for (i = -1; (i += w) <= n - F;) { f = fbits(y, i); if ((j = Z->head[f]) != Mnil) { i -= j; if (!xcmp(y, i, x, w)) return i; } } return -1; } // one-bit rotate void rot1(char *v, size_t n) { unsigned char *p = (unsigned char *)v; size_t i, c; for (c = i = 0; i < n; c >>= 8, i++) { p[i] = c += p[i] << 1; } p[0] += c; } int main(int argc, char *argv[]) { static char buf[(99<<20) + 1]; size_t n = read(0, buf, sizeof(buf) - 1); size_t t = argc > 1 ? strtoul(argv[1], NULL, 0) : 0; size_t m = (t < F || t - 1 > Mnil) ? Mdflt : t; char *x = buf, *y = buf + (m+7)/8; if ((long)n < 0 || (long)n < 2*(y-x)) { printf("read %ld bytes, need at least %ld\n", (long)n, (long)2 +*(y-x)); return 1; } n = 8 * (n - (y-x)); printf("haystack=%zu needle=%zu (bits)\n", n, m); if (argc > 2) { size_t r, i = strtoul(argv[2], NULL, 0); i += (long)i < 0 ? (n-m) : 0; i = i < (n-m) ? i : (n-m); memcpy(y + i/8, x, (y-x)); // stuff the needle and rotate for (r = i & 7; r; r--) rot1(y + i/8, (y-x) + 1); printf("needle at %zu (byte offset %zu, rot %zu)\n", i, i/8, i +%8); } printf("search=%ld\n", (long)search(y, n, x, m)); return 0; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^3: [OT] The interesting problem of ... bit-strings (alpha skip search)
by BrowserUk (Patriarch) on Apr 01, 2015 at 11:47 UTC |