1
0
mirror of https://github.com/RIOT-OS/RIOT.git synced 2024-12-29 04:50:03 +01:00

moved doc to header and converted it to doxygen

This commit is contained in:
Christian Mehlis 2013-08-10 12:06:09 +02:00
parent 5a45d15894
commit 0fb5e89c89
3 changed files with 228 additions and 199 deletions

View File

@ -1,110 +1,18 @@
/****************************************************************************** /**
* bloom.c * Bloom filter implementation
* ```````
* Bloom filters
* *
* HISTORY * Copyright (C) 2013 Freie Universität Berlin
* {x, y, z}
* A Bloom filter is a probibalistic : : :
* data structure with several interesting /|\ /|\ /|\
* properties, such as low memory usage, / | X | X | \
* asymmetric query confidence, and a very / |/ \|/ \| \
* speedy O(k) membership test. / | | \ \
* / /| /|\ |\ \
* Because a Bloom filter can . . . . . . . . .
* accept any input that can be 00000000001000101010101010100010000000000
* hashed effectively (such as " " "
* strings), that membership test \ | /
* tends to draw a crowd. TNSTAAFL, but \ | /
* as caveats go, the Bloom filters' are \ | /
* more interesting than incapacitating. \|/
* :
* Most notably, it can tell you with certainty {w}
* that an item 'i' is *not* a member of set 's',
* but it can only tell you with some finite
* probability whether an item 'i' *is* a member
* of set 's'.
* *
* Still, along with the intriguing possibility of using bitwise AND and OR * This file subject to the terms and conditions of the GNU Lesser General
* to compute the logical union and intersection of two filters, the cheap * Public License. See the file LICENSE in the top level directory for more
* cost of adding elements to the filter set, and the low memory requirements, * details.
* the Bloom filter is a good choice for many applications.
* *
* NOTES * @file
* @author Jason Linehan <patientulysses@gmail.com>
* @author Christian Mehlis <mehlis@inf.fu-berlin.de>
* @author Freie Universität Berlin, Computer Systems & Telematics
* *
* Let's look more closely at the probability values. */
*
* Assume that a hash function selects each array position with equal
* probability. If m is the number of bits in the array, and k is the number
* of hash functions, then the probability that a certain bit is not set
* to 1 by a certain hash function during the insertion of an element is
*
* 1-(1/m).
*
* The probability that it is not set to 1 by any of the hash functions is
*
* (1-(1/m))^k.
*
* If we have inserted n elements, the probability that a certain bit is
* set 0 is
*
* (1-(1/m))^kn,
*
* Meaning that the probability said bit is set to 1 is therefore
*
* 1-([1-(1/m)]^kn).
*
* Now test membership of an element that is not in the set. Each of the k
* array positions computed by the hash functions is 1 with a probability
* as above. The probability of all of them being 1, which would cause the
* algorithm to erroneously claim that the element is in the set, is often
* given as
*
* (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
*
* This is not strictly correct as it assumes independence for the
* probabilities of each bit being set. However, assuming it is a close
* approximation we have that the probability of false positives descreases
* as m (the number of bits in the array) increases, and increases as n
* (the number of inserted elements) increases. For a given m and n, the
* value of k (the number of hash functions) that minimizes the probability
* is
*
* (m/n)ln(2) ~~ 0.7(m/n),
*
* which gives the false positive probability of
*
* 2^-k ~~ 0.6185^(m/n).
*
* The required number of bits m, given n and a desired false positive
* probability p (and assuming the optimal value of k is used) can be
* computed by substituting the optimal value of k in the probability
* expression above:
*
* p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
*
* which simplifies to
*
* ln(p) = -(m/n) * (ln2)^2.
*
* This results in the equation
*
* m = -((n*ln(p)) / ((ln(2))^2))
*
* The classic filter uses
*
* 1.44*log2(1/eta)
*
* bits of space per inserted key, where eta is the false positive rate of
* the Bloom filter.
*
* AUTHOR
* Jason Linehan (patientulysses@gmail.com)
*
* LICENSE
* Public domain.
*
******************************************************************************/
#include <limits.h> #include <limits.h>
#include <stdarg.h> #include <stdarg.h>
@ -112,23 +20,10 @@
#include "bloom.h" #include "bloom.h"
#define SETBIT(a,n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT))) #define SETBIT(a,n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
#define GETBIT(a,n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT))) #define GETBIT(a,n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))
#define ROUND(size) ((size + CHAR_BIT - 1) / CHAR_BIT) #define ROUND(size) ((size + CHAR_BIT - 1) / CHAR_BIT)
/******************************************************************************
* bloom_new Allocate and return a pointer to a new Bloom filter.
* `````````
* @size : size of the bit array in the filter
* @nfuncs: the number of hash functions
* Returns: An allocated bloom filter
*
* USAGE
* For best results, make 'size' a power of 2.
*
******************************************************************************/
struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) { struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
struct bloom_t *bloom; struct bloom_t *bloom;
va_list hashes; va_list hashes;
@ -171,14 +66,6 @@ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
return bloom; return bloom;
} }
/******************************************************************************
* bloom_del Delete a Bloom filter.
* `````````
* @bloom : The condemned.
* Returns: nothing.
*
******************************************************************************/
void bloom_del(struct bloom_t *bloom) void bloom_del(struct bloom_t *bloom)
{ {
free(bloom->a); free(bloom->a);
@ -186,18 +73,6 @@ void bloom_del(struct bloom_t *bloom)
free(bloom); free(bloom);
} }
/******************************************************************************
* bloom_add Add a string to a Bloom filter.
* `````````
* @bloom : Bloom filter
* @s : string to add
* Returns: nothing.
*
* CAVEAT
* Once a string has been added to the filter, it cannot be "removed"!
*
******************************************************************************/
void bloom_add(struct bloom_t *bloom, const char *s) void bloom_add(struct bloom_t *bloom, const char *s)
{ {
unsigned int hash; unsigned int hash;
@ -209,46 +84,6 @@ void bloom_add(struct bloom_t *bloom, const char *s)
} }
} }
/******************************************************************************
* bloom_check Determine if a string is in the Bloom filter.
* ```````````
* @bloom : Bloom filter
* @s : string to add
* Returns: false if string does not exist in the filter, otherwise true.
*
* NOTES
*
* So this is the freakshow that bored programmers pay a nickel to get a
* peek at, step right up. This is the way the membership test works.
*
* The string 's' is hashed once for each of the 'k' hash functions, as
* though we were planning to add it to the filter. Instead of adding it
* however, we examine the bit that we *would* have set, and consider its
* value.
*
* If the bit is 1 (set), the string we are hashing may be in the filter,
* since it would have set this bit when it was originally hashed. However,
* it may also be that another string just happened to produce a hash value
* that would also set this bit. That would be a false positive. This is why
* we have k > 1, so we can minimize the likelihood of false positives
* occuring.
*
* If every bit corresponding to every one of the k hashes of our query
* string is set, we can say with some probability of being correct that
* the string we are holding is indeed "in" the filter. However, we can
* never be sure.
*
* If, however, as we hash our string and peek at the resulting bit in the
* filter, we find the bit is 0 (not set)... well now, that's different.
* In this case, we can say with absolute certainty that the string we are
* holding is *not* in the filter, because if it were, this bit would have
* to be set.
*
* In this way, the Bloom filter can answer NO with absolute surety, but
* can only speak a qualified YES.
*
******************************************************************************/
bool bloom_check(struct bloom_t *bloom, const char *s) bool bloom_check(struct bloom_t *bloom, const char *s)
{ {
unsigned int hash; unsigned int hash;

View File

@ -1,3 +1,111 @@
/**
* bloom.c
*
* Bloom filters
*
* HISTORY
* {x, y, z}
* A Bloom filter is a probabilistic : : :
* data structure with several interesting /|\ /|\ /|\
* properties, such as low memory usage, / | X | X | \
* asymmetric query confidence, and a very / |/ \|/ \| \
* speedy O(k) membership test. / | | \ \
* / /| /|\ |\ \
* Because a Bloom filter can . . . . . . . . .
* accept any input that can be 00000000001000101010101010100010000000000
* hashed effectively (such as " " "
* strings), that membership test \ | /
* tends to draw a crowd. TNSTAAFL, but \ | /
* as caveats go, the Bloom filters' are \ | /
* more interesting than incapacitating. \|/
* :
* Most notably, it can tell you with certainty {w}
* that an item 'i' is *not* a member of set 's',
* but it can only tell you with some finite
* probability whether an item 'i' *is* a member
* of set 's'.
*
* Still, along with the intriguing possibility of using bitwise AND and OR
* to compute the logical union and intersection of two filters, the cheap
* cost of adding elements to the filter set, and the low memory requirements,
* the Bloom filter is a good choice for many applications.
*
* NOTES
*
* Let's look more closely at the probability values.
*
* Assume that a hash function selects each array position with equal
* probability. If m is the number of bits in the array, and k is the number
* of hash functions, then the probability that a certain bit is not set
* to 1 by a certain hash function during the insertion of an element is
*
* 1-(1/m).
*
* The probability that it is not set to 1 by any of the hash functions is
*
* (1-(1/m))^k.
*
* If we have inserted n elements, the probability that a certain bit is
* still 0 is
*
* (1-(1/m))^kn,
*
* Meaning that the probability said bit is set to 1 is therefore
*
* 1-([1-(1/m)]^kn).
*
* Now test membership of an element that is not in the set. Each of the k
* array positions computed by the hash functions is 1 with a probability
* as above. The probability of all of them being 1, which would cause the
* algorithm to erroneously claim that the element is in the set, is often
* given as
*
* (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
*
* This is not strictly correct as it assumes independence for the
* probabilities of each bit being set. However, assuming it is a close
* approximation we have that the probability of false positives decreases
* as m (the number of bits in the array) increases, and increases as n
* (the number of inserted elements) increases. For a given m and n, the
* value of k (the number of hash functions) that minimizes the probability
* is
*
* (m/n)ln(2) ~~ 0.7(m/n),
*
* which gives the false positive probability of
*
* 2^-k ~~ 0.6185^(m/n).
*
* The required number of bits m, given n and a desired false positive
* probability p (and assuming the optimal value of k is used) can be
* computed by substituting the optimal value of k in the probability
* expression above:
*
* p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
*
* which simplifies to
*
* ln(p) = -(m/n) * (ln2)^2.
*
* This results in the equation
*
* m = -((n*ln(p)) / ((ln(2))^2))
*
* The classic filter uses
*
* 1.44*log2(1/eta)
*
* bits of space per inserted key, where eta is the false positive rate of
* the Bloom filter.
*
*/
/**
* @file
* @author Christian Mehlis <mehlis@inf.fu-berlin.de>
* @author Freie Universität Berlin, Computer Systems & Telematics
*/
#ifndef _BLOOM_FILTER_H #ifndef _BLOOM_FILTER_H
#define _BLOOM_FILTER_H #define _BLOOM_FILTER_H
@ -5,8 +113,14 @@
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
/**
* hashfp_t hash function to use in the filter
*/
typedef unsigned int (*hashfp_t)(const char *); typedef unsigned int (*hashfp_t)(const char *);
/**
* struct bloom_t bloom filter object
*/
struct bloom_t { struct bloom_t {
size_t m; size_t m;
size_t k; size_t k;
@ -14,9 +128,77 @@ struct bloom_t {
hashfp_t *hash; hashfp_t *hash;
}; };
/**
* bloom_new Allocate and return a pointer to a new Bloom filter.
*
* For best results, make 'size' a power of 2.
*
* @param size size of the bit array in the filter
* @param num_hashes the number of hash functions
* @param ... variadic hash function pointers, each of type hashfp_t
*
* @return An allocated bloom filter
*
*/
struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...); struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...);
/**
* bloom_del Delete a Bloom filter.
*
* @param bloom The condemned
* @return nothing
*
*/
void bloom_del(struct bloom_t *bloom); void bloom_del(struct bloom_t *bloom);
/**
* bloom_add Add a string to a Bloom filter.
*
* CAVEAT
* Once a string has been added to the filter, it cannot be "removed"!
*
* @param bloom Bloom filter
* @param s string to add
* @return nothing
*
*/
void bloom_add(struct bloom_t *bloom, const char *s); void bloom_add(struct bloom_t *bloom, const char *s);
/**
* bloom_check Determine if a string is in the Bloom filter.
*
* The string 's' is hashed once for each of the 'k' hash functions, as
* though we were planning to add it to the filter. Instead of adding it
* however, we examine the bit that we *would* have set, and consider its
* value.
*
* If the bit is 1 (set), the string we are hashing may be in the filter,
* since it would have set this bit when it was originally hashed. However,
* it may also be that another string just happened to produce a hash value
* that would also set this bit. That would be a false positive. This is why
* we have k > 1, so we can minimize the likelihood of false positives
* occurring.
*
* If every bit corresponding to every one of the k hashes of our query
* string is set, we can say with some probability of being correct that
* the string we are holding is indeed "in" the filter. However, we can
* never be sure.
*
* If, however, as we hash our string and peek at the resulting bit in the
* filter, we find the bit is 0 (not set)... well now, that's different.
* In this case, we can say with absolute certainty that the string we are
* holding is *not* in the filter, because if it were, this bit would have
* to be set.
*
* In this way, the Bloom filter can answer NO with absolute surety, but
* can only speak a qualified YES.
*
* @param bloom Bloom filter
* @param s string to check
* @return false if string does not exist in the filter
* @return true if string may be in the filter
*
*/
bool bloom_check(struct bloom_t *bloom, const char *s); bool bloom_check(struct bloom_t *bloom, const char *s);
#endif #endif

View File

@ -1,6 +1,23 @@
/****************************************************************************** /**
* This file contains some simple hash function
*
* Copyright (C) 2013 Freie Universität Berlin
*
* This file is subject to the terms and conditions of the GNU Lesser General
* Public License. See the file LICENSE in the top level directory for more
* details.
*/
/**
* @file
* @author Jason Linehan <patientulysses@gmail.com>
* @author Freie Universität Berlin, Computer Systems & Telematics
* @author Christian Mehlis <mehlis@inf.fu-berlin.de>
*/
/**
* djb2_hash * djb2_hash
* ````````` *
* HISTORY * HISTORY
* This algorithm (k=33) was first reported by Dan Bernstein many years * This algorithm (k=33) was first reported by Dan Bernstein many years
* ago in comp.lang.c. Another version of this algorithm (now favored by * ago in comp.lang.c. Another version of this algorithm (now favored by
@ -10,8 +27,7 @@
* *
* The magic of number 33 (why it works better than many other constants, * The magic of number 33 (why it works better than many other constants,
* prime or not) has never been adequately explained. * prime or not) has never been adequately explained.
* */
******************************************************************************/
static inline unsigned long djb2_hash(const char *str) static inline unsigned long djb2_hash(const char *str)
{ {
unsigned long hash; unsigned long hash;
@ -26,9 +42,9 @@ static inline unsigned long djb2_hash(const char *str)
return hash; return hash;
} }
/****************************************************************************** /**
* sdbm_hash * sdbm_hash
* ````````` *
* HISTORY * HISTORY
* This algorithm was created for sdbm (a public-domain reimplementation * This algorithm was created for sdbm (a public-domain reimplementation
* of ndbm) database library. It was found to do well in scrambling bits, * of ndbm) database library. It was found to do well in scrambling bits,
@ -45,7 +61,7 @@ static inline unsigned long djb2_hash(const char *str)
* out to be a prime. this is one of the algorithms used in berkeley db * out to be a prime. this is one of the algorithms used in berkeley db
* (see sleepycat) and elsewhere. * (see sleepycat) and elsewhere.
* *
******************************************************************************/ */
static inline unsigned long sdbm_hash(const char *str) static inline unsigned long sdbm_hash(const char *str)
{ {
unsigned long hash; unsigned long hash;
@ -60,9 +76,9 @@ static inline unsigned long sdbm_hash(const char *str)
return hash; return hash;
} }
/****************************************************************************** /**
* lose lose * lose lose
* ````````` *
* HISTORY * HISTORY
* This hash function appeared in K&R (1st ed) but at least the reader * This hash function appeared in K&R (1st ed) but at least the reader
* was warned: * was warned:
@ -78,8 +94,7 @@ static inline unsigned long sdbm_hash(const char *str)
* checking something like Knuth's Sorting and Searching, so it stuck. * checking something like Knuth's Sorting and Searching, so it stuck.
* It is now found mixed with otherwise respectable code, eg. cnews. sigh. * It is now found mixed with otherwise respectable code, eg. cnews. sigh.
* [see also: tpop] * [see also: tpop]
* */
******************************************************************************/
static inline unsigned long kr_hash(const char *str) static inline unsigned long kr_hash(const char *str)
{ {
unsigned int hash; unsigned int hash;
@ -94,12 +109,11 @@ static inline unsigned long kr_hash(const char *str)
return hash; return hash;
} }
/****************************************************************************** /**
* sax_hash * sax_hash
* ````````
* Shift, Add, XOR
* *
******************************************************************************/ * Shift, Add, XOR
*/
static inline unsigned int sax_hash(const char *key) static inline unsigned int sax_hash(const char *key)
{ {
unsigned int h; unsigned int h;
@ -114,14 +128,13 @@ static inline unsigned int sax_hash(const char *key)
} }
/****************************************************************************** /**
* dek_hash * dek_hash
* ```````` *
* HISTORY * HISTORY
* Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3, * Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3,
* under the topic of "Sorting and Search", Chapter 6.4. * under the topic of "Sorting and Search", Chapter 6.4.
* */
******************************************************************************/
static inline unsigned int dek_hash(const char *str, unsigned int len) static inline unsigned int dek_hash(const char *str, unsigned int len)
{ {
unsigned int hash; unsigned int hash;
@ -138,13 +151,12 @@ static inline unsigned int dek_hash(const char *str, unsigned int len)
} }
/****************************************************************************** /**
* fnv_hash * fnv_hash
* ```````` *
* NOTE * NOTE
* For a more fully featured and modern version of this hash, see fnv32.c * For a more fully featured and modern version of this hash, see fnv32.c
* */
******************************************************************************/
static inline unsigned int fnv_hash(const char *str) static inline unsigned int fnv_hash(const char *str)
{ {
#define FNV_PRIME 0x811C9DC5 #define FNV_PRIME 0x811C9DC5