From 5a45d15894ce64f064be11514beb5faf0b8a3731 Mon Sep 17 00:00:00 2001 From: Christian Mehlis Date: Sun, 4 Aug 2013 22:01:11 +0200 Subject: [PATCH] initial bloom filter import --- sys/Makefile | 3 + sys/bloom/Makefile | 5 + sys/bloom/bloom.c | 267 +++++++++++++++++++++++++++++++++++++++++++ sys/include/bloom.h | 22 ++++ sys/include/hashes.h | 164 ++++++++++++++++++++++++++ 5 files changed, 461 insertions(+) create mode 100644 sys/bloom/Makefile create mode 100644 sys/bloom/bloom.c create mode 100644 sys/include/bloom.h create mode 100644 sys/include/hashes.h diff --git a/sys/Makefile b/sys/Makefile index d0f2362953..66bbdbe6bd 100644 --- a/sys/Makefile +++ b/sys/Makefile @@ -82,6 +82,9 @@ endif ifneq (,$(findstring ieee802154,$(USEMODULE))) DIRS += net/ieee802154 endif +ifneq (,$(findstring bloom,$(USEMODULE))) + DIRS += bloom +endif all: $(BINDIR)$(MODULE).a @for i in $(DIRS) ; do $(MAKE) -C $$i ; done ; diff --git a/sys/bloom/Makefile b/sys/bloom/Makefile new file mode 100644 index 0000000000..8210b99fe8 --- /dev/null +++ b/sys/bloom/Makefile @@ -0,0 +1,5 @@ +INCLUDES = -I../include -I$(RIOTBASE)/core/include +MODULE = bloom + +include $(RIOTBASE)/Makefile.base + diff --git a/sys/bloom/bloom.c b/sys/bloom/bloom.c new file mode 100644 index 0000000000..ef12f4ec5d --- /dev/null +++ b/sys/bloom/bloom.c @@ -0,0 +1,267 @@ +/****************************************************************************** + * bloom.c + * ``````` + * Bloom filters + * + * HISTORY + * {x, y, z} + * A Bloom filter is a probabilistic : : : + * data structure with several interesting /|\ /|\ /|\ + * properties, such as low memory usage, / | X | X | \ + * asymmetric query confidence, and a very / |/ \|/ \| \ + * speedy O(k) membership test. / | | \ \ + * / /| /|\ |\ \ + * Because a Bloom filter can . . . . . . . . . + * accept any input that can be 00000000001000101010101010100010000000000 + * hashed effectively (such as " " " + * strings), that membership test \ | / + * tends to draw a crowd. 
TNSTAAFL, but \ | / + * as caveats go, the Bloom filters' are \ | / + * more interesting than incapacitating. \|/ + * : + * Most notably, it can tell you with certainty {w} + * that an item 'i' is *not* a member of set 's', + * but it can only tell you with some finite + * probability whether an item 'i' *is* a member + * of set 's'. + * + * Still, along with the intriguing possibility of using bitwise AND and OR + * to compute the logical union and intersection of two filters, the cheap + * cost of adding elements to the filter set, and the low memory requirements, + * the Bloom filter is a good choice for many applications. + * + * NOTES + * + * Let's look more closely at the probability values. + * + * Assume that a hash function selects each array position with equal + * probability. If m is the number of bits in the array, and k is the number + * of hash functions, then the probability that a certain bit is not set + * to 1 by a certain hash function during the insertion of an element is + * + * 1-(1/m). + * + * The probability that it is not set to 1 by any of the hash functions is + * + * (1-(1/m))^k. + * + * If we have inserted n elements, the probability that a certain bit is + * set 0 is + * + * (1-(1/m))^kn, + * + * Meaning that the probability said bit is set to 1 is therefore + * + * 1-([1-(1/m)]^kn). + * + * Now test membership of an element that is not in the set. Each of the k + * array positions computed by the hash functions is 1 with a probability + * as above. The probability of all of them being 1, which would cause the + * algorithm to erroneously claim that the element is in the set, is often + * given as + * + * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k. + * + * This is not strictly correct as it assumes independence for the + * probabilities of each bit being set. 
/******************************************************************************
 * bloom.c -- NOTES (continuation of the false-positive analysis in the
 *            bloom.c header comment)
 *
 * However, assuming it is a close approximation we have that the probability
 * of false positives decreases as m (the number of bits in the array)
 * increases, and increases as n (the number of inserted elements) increases.
 * For a given m and n, the value of k (the number of hash functions) that
 * minimizes the probability is
 *
 *      (m/n)ln(2) ~~ 0.7(m/n),
 *
 * which gives the false positive probability of
 *
 *      2^-k ~~ 0.6185^(m/n).
 *
 * The required number of bits m, given n and a desired false positive
 * probability p (and assuming the optimal value of k is used) can be
 * computed by substituting the optimal value of k in the probability
 * expression above:
 *
 *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
 *
 * which simplifies to
 *
 *      ln(p) = -(m/n) * (ln2)^2.
 *
 * This results in the equation
 *
 *      m = -((n*ln(p)) / ((ln(2))^2))
 *
 * The classic filter uses
 *
 *      1.44*log2(1/eta)
 *
 * bits of space per inserted key, where eta is the false positive rate of
 * the Bloom filter.
 *
 * AUTHOR
 *      Jason Linehan (patientulysses@gmail.com)
 *
 * LICENSE
 *      Public domain.
 *
 ******************************************************************************/


/******************************************************************************
 * sys/include/bloom.h
 * ```````````````````
 * Public interface of the Bloom filter. The declarations are listed ahead of
 * the bloom.c code so that everything is declared before it is used.
 *
 * NOTE(review): the targets of the three system #include lines were lost in
 * extraction; <stdlib.h>, <stdbool.h> and <stdint.h> are a reconstruction
 * that provides size_t and bool -- confirm against the original header.
 *
 * The include guard was renamed from _BLOOM_FILTER_H, because identifiers
 * that start with an underscore followed by an uppercase letter are reserved
 * for the implementation.
 *
 ******************************************************************************/
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H

#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>

/* Hash callback: maps a NUL-terminated string to an unsigned int. */
typedef unsigned int (*hashfp_t)(const char *);

struct bloom_t {
    size_t m;           /* number of bits in the filter */
    size_t k;           /* number of hash functions in 'hash' */
    unsigned char *a;   /* bit array holding the m bits */
    hashfp_t *hash;     /* the k hash function pointers */
};

struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...);
void bloom_del(struct bloom_t *bloom);
void bloom_add(struct bloom_t *bloom, const char *s);
bool bloom_check(struct bloom_t *bloom, const char *s);

#endif


/******************************************************************************
 * sys/bloom/bloom.c
 * `````````````````
 * Implementation.
 *
 * NOTE(review): the targets of the three system #include lines were lost in
 * extraction; <limits.h> (CHAR_BIT), <stdarg.h> (va_list) and <stdlib.h>
 * (malloc/calloc/free) are a reconstruction -- confirm against the original.
 *
 ******************************************************************************/
#include <limits.h>
#include <stdarg.h>
#include <stdlib.h>

/* Skip the include when the declarations above are already visible. */
#ifndef BLOOM_FILTER_H
#include "bloom.h"
#endif


/*
 * Bit helpers for an array 'a' of unsigned char and a bit index 'n'.
 * Every argument is parenthesized so that expressions such as 'i + 8' index
 * the intended byte and bit. Each macro evaluates 'n' (or 'size') twice, so
 * do not pass expressions with side effects.
 *
 * ROUND yields the number of bytes needed for 'size' bits. It is written
 * as divide-then-adjust so that it cannot wrap around for sizes close to
 * the maximum of the argument's type.
 */
#define SETBIT(a,n) ((a)[(n) / CHAR_BIT] |= (1u << ((n) % CHAR_BIT)))
#define GETBIT(a,n) ((a)[(n) / CHAR_BIT] & (1u << ((n) % CHAR_BIT)))
#define ROUND(size) ((size) / CHAR_BIT + ((size) % CHAR_BIT != 0))


/******************************************************************************
 * bloom_new    Allocate and return a pointer to a new Bloom filter.
 * `````````
 * @size      : size of the bit array in the filter, in bits (must be > 0)
 * @num_hashes: the number of hash functions that follow
 * @...       : 'num_hashes' hash functions, each of type hashfp_t
 * Returns: An allocated bloom filter with all bits cleared, or NULL if
 *          'size' is 0 or an allocation fails.
 *
 * USAGE
 *      For best results, make 'size' a power of 2.
 *      Release the filter with bloom_del().
 *
 ******************************************************************************/
struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...)
{
    struct bloom_t *bloom;
    va_list hashes;
    size_t n;

    /*
     * bloom_add() and bloom_check() reduce every hash modulo the bit count,
     * so an empty filter would make them divide by zero.
     */
    if (size == 0) {
        return NULL;
    }

    /* Allocate Bloom filter container */
    bloom = malloc(sizeof *bloom);

    if (bloom == NULL) {
        return NULL;
    }

    /* Allocate Bloom array, zero-filled so that no bit starts out set */
    bloom->a = calloc(ROUND(size), sizeof *bloom->a);

    if (bloom->a == NULL) {
        free(bloom);
        return NULL;
    }

    /*
     * Allocate Bloom filter hash function pointers. calloc() rejects a
     * count that would overflow the byte size, and at least one slot is
     * requested so that num_hashes == 0 does not depend on what a
     * zero-sized allocation returns.
     */
    bloom->hash = calloc(num_hashes ? num_hashes : 1, sizeof *bloom->hash);

    if (bloom->hash == NULL) {
        free(bloom->a);
        free(bloom);
        return NULL;
    }

    /* Assign hash functions to pointers in the Bloom filter */
    va_start(hashes, num_hashes);

    for (n = 0; n < num_hashes; n++) {
        bloom->hash[n] = va_arg(hashes, hashfp_t);
    }

    va_end(hashes);

    /*
     * Record the number of hash functions (k) and the number of bits
     * in the Bloom array (m).
     */
    bloom->k = num_hashes;
    bloom->m = size;

    return bloom;
}


/******************************************************************************
 * bloom_del    Delete a Bloom filter.
 * `````````
 * @bloom  : The condemned. NULL is accepted and ignored.
 * Returns: nothing.
 *
 ******************************************************************************/
void bloom_del(struct bloom_t *bloom)
{
    if (bloom == NULL) {
        return;
    }

    free(bloom->a);
    free(bloom->hash);
    free(bloom);
}


/******************************************************************************
 * bloom_add    Add a string to a Bloom filter.
 * `````````
 * @bloom  : Bloom filter
 * @s      : string to add
 * Returns: nothing.
 *
 * CAVEAT
 *      Once a string has been added to the filter, it cannot be "removed"!
 *
 ******************************************************************************/
void bloom_add(struct bloom_t *bloom, const char *s)
{
    size_t n;
    size_t bit;

    for (n = 0; n < bloom->k; n++) {
        /* Reduce the hash to a bit position inside the m-bit array. */
        bit = bloom->hash[n](s) % bloom->m;
        SETBIT(bloom->a, bit);
    }
}


/******************************************************************************
 * bloom_check    Determine if a string is in the Bloom filter.
 * ```````````
 * @bloom  : Bloom filter
 * @s      : string to look up
 * Returns: false if string does not exist in the filter, otherwise true.
 *
 * NOTES
 *
 * So this is the freakshow that bored programmers pay a nickel to get a
 * peek at, step right up. This is the way the membership test works.
 *
 * The string 's' is hashed once for each of the 'k' hash functions, as
 * though we were planning to add it to the filter. Instead of adding it
 * however, we examine the bit that we *would* have set, and consider its
 * value.
 *
 * If the bit is 1 (set), the string we are hashing may be in the filter,
 * since it would have set this bit when it was originally hashed. However,
 * it may also be that another string just happened to produce a hash value
 * that would also set this bit. That would be a false positive. This is why
 * we have k > 1, so we can minimize the likelihood of false positives
 * occurring.
 *
 * If every bit corresponding to every one of the k hashes of our query
 * string is set, we can say with some probability of being correct that
 * the string we are holding is indeed "in" the filter. However, we can
 * never be sure.
 *
 * If, however, as we hash our string and peek at the resulting bit in the
 * filter, we find the bit is 0 (not set)... well now, that's different.
 * In this case, we can say with absolute certainty that the string we are
 * holding is *not* in the filter, because if it were, this bit would have
 * to be set.
 *
 * In this way, the Bloom filter can answer NO with absolute surety, but
 * can only speak a qualified YES.
 *
 ******************************************************************************/
bool bloom_check(struct bloom_t *bloom, const char *s)
{
    size_t n;
    size_t bit;

    for (n = 0; n < bloom->k; n++) {
        bit = bloom->hash[n](s) % bloom->m;

        if (!(GETBIT(bloom->a, bit))) {
            return false;   /* one clear bit proves 's' was never added */
        }
    }

    return true; /* ? -- every probed bit is set: a qualified YES */
}


/******************************************************************************
 * sys/include/hashes.h
 * ````````````````````
 * String hash functions.
 *
 * NOTE(review): this header defines functions but has no include guard.
 *
 ******************************************************************************/

/******************************************************************************
 * djb2_hash
 * `````````
 * @str    : NUL-terminated string to hash
 * Returns: the hash; the empty string hashes to 5381.
 *
 * HISTORY
 *      This algorithm (k=33) was first reported by Dan Bernstein many years
 *      ago in comp.lang.c. Another version of this algorithm (now favored by
 *      bernstein) uses XOR:
 *
 *              hash(i) = hash(i - 1) * 33 ^ str[i];
 *
 *      The magic of number 33 (why it works better than many other constants,
 *      prime or not) has never been adequately explained.
 *
 * NOTE(review): the return type is unsigned long, which does not match the
 * 'unsigned int (*)(const char *)' callback type hashfp_t, so this function
 * cannot be handed to bloom_new() directly without a wrapper.
 *
 ******************************************************************************/
static inline unsigned long djb2_hash(const char *str)
{
    unsigned long hash;
    int c;

    hash = 5381;

    /* Bytes are read as unsigned char so the result does not depend on
     * whether plain char is signed. */
    while ((c = (unsigned char) * str++)) {
        hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
    }

    return hash;
}

/******************************************************************************
 * sdbm_hash
 * `````````
 * HISTORY
 *      This algorithm was created for sdbm (a public-domain reimplementation
 *      of ndbm) database library. It was found to do well in scrambling bits,
 *      causing better distribution of the keys and fewer splits. it also
 *      happens to be a good general hashing function with good distribution.
 ******************************************************************************/
/******************************************************************************
 * sdbm_hash
 * `````````
 * @str    : NUL-terminated string to hash
 * Returns: the hash; the empty string hashes to 0.
 *
 * HISTORY (continued)
 *      The actual function is
 *
 *              hash(i) = hash(i - 1) * 65599 + str[i];
 *
 *      What is included below is the faster version used in gawk. [there is
 *      even a faster, duff-device version] the magic constant 65599 was picked
 *      out of thin air while experimenting with different constants, and turns
 *      out to be a prime. this is one of the algorithms used in berkeley db
 *      (see sleepycat) and elsewhere.
 *
 * NOTE(review): the return type is unsigned long, which does not match the
 * 'unsigned int (*)(const char *)' callback type declared in bloom.h.
 *
 ******************************************************************************/
static inline unsigned long sdbm_hash(const char *str)
{
    unsigned long hash;
    int c;

    hash = 0;

    while ((c = (unsigned char) * str++)) {
        /* (hash << 6) + (hash << 16) - hash == hash * 65599 */
        hash = c + (hash << 6) + (hash << 16) - hash;
    }

    return hash;
}

/******************************************************************************
 * lose lose
 * `````````
 * @str    : NUL-terminated string to hash
 * Returns: the sum of the string's bytes. The sum is accumulated in an
 *          unsigned int and widened to unsigned long on return.
 *
 * HISTORY
 *      This hash function appeared in K&R (1st ed) but at least the reader
 *      was warned:
 *
 *              "This is not the best possible algorithm, but it has the merit
 *              of extreme simplicity."
 *
 *      This is an understatement. It is a terrible hashing algorithm, and it
 *      could have been much better without sacrificing its "extreme simplicity."
 *      [see the second edition!]
 *
 *      Many C programmers use this function without actually testing it, or
 *      checking something like Knuth's Sorting and Searching, so it stuck.
 *      It is now found mixed with otherwise respectable code, eg. cnews. sigh.
 *      [see also: tpop]
 *
 ******************************************************************************/
static inline unsigned long kr_hash(const char *str)
{
    unsigned int hash;
    unsigned int c;

    hash = 0;

    while ((c = (unsigned char) * str++)) {
        hash += c;
    }

    return hash;
}

/******************************************************************************
 * sax_hash
 * ````````
 * Shift, Add, XOR
 *
 * @key    : NUL-terminated string to hash
 * Returns: the hash; the empty string hashes to 0.
 *
 ******************************************************************************/
static inline unsigned int sax_hash(const char *key)
{
    unsigned int h;

    h = 0;

    while (*key) {
        h ^= (h << 5) + (h >> 2) + (unsigned char) * key++;
    }

    return h;
}


/******************************************************************************
 * dek_hash
 * ````````
 * @str    : NUL-terminated string to hash
 * @len    : initial hash value. It does NOT bound the loop: hashing always
 *           runs up to the terminating NUL of 'str'.
 * Returns: the hash; the empty string hashes to 'len'.
 *
 * HISTORY
 *      Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3,
 *      under the topic of "Sorting and Search", Chapter 6.4.
 *
 * NOTES
 *      Bytes are read as unsigned char, like the other hashes in this file.
 *      Converting a plain char straight to unsigned int would sign-extend
 *      bytes >= 0x80 on platforms where char is signed, making the result
 *      platform dependent.
 *
 *      NOTE(review): (hash << 5) ^ (hash >> 27) looks like a 5-bit rotate
 *      that assumes a 32-bit unsigned int -- confirm for other widths.
 *
 ******************************************************************************/
static inline unsigned int dek_hash(const char *str, unsigned int len)
{
    unsigned int hash;
    unsigned int c;

    hash = len;

    while ((c = (unsigned char) * str++)) {
        hash = ((hash << 5) ^ (hash >> 27)) ^ (c);
    }

    return hash;
}


/******************************************************************************
 * fnv_hash
 * ````````
 * @str    : NUL-terminated string to hash
 * Returns: the hash; the empty string hashes to 0.
 *
 * NOTE
 *      For a more fully featured and modern version of this hash, see fnv32.c
 *
 *      Bytes are read as unsigned char so that bytes >= 0x80 are not
 *      sign-extended where plain char is signed.
 *
 *      NOTE(review): the FNV specification lists 0x811C9DC5 as the 32-bit
 *      offset basis (its 32-bit prime is 0x01000193), and the hash here
 *      starts from 0, so this is not the reference FNV-1. The constant and
 *      the start value are left untouched so existing hash values stay
 *      stable.
 *
 ******************************************************************************/
#define FNV_PRIME 0x811C9DC5
static inline unsigned int fnv_hash(const char *str)
{
    unsigned int hash;
    unsigned int c;

    hash = 0;

    while ((c = (unsigned char) * str++)) {
        hash *= FNV_PRIME;
        hash ^= (c);
    }

    return hash;
}