moved doc to header and converted it to doxygen

2024-12-29 04:50:03 +01:00 · 2013-08-10 12:06:09 +02:00 · 2013-08-10 12:06:09 +02:00 · 0fb5e89c89
commit 0fb5e89c89
parent 5a45d15894
3 changed files with 228 additions and 199 deletions
--- a/sys/bloom/bloom.c
+++ b/sys/bloom/bloom.c
@ -1,110 +1,18 @@
-/******************************************************************************
+/**
- * bloom.c
+ * Bloom filter implementation
 * ```````
 * Bloom filters
 *
- * HISTORY
+ * Copyright (C) 2013 Freie Universität Berlin
 *                                                   {x,  y,  z}
 * A Bloom filter is a probibalistic                  :   :   :
 * data structure with several interesting           /|\ /|\ /|\
 * properties, such as low memory usage,            / | X | X | \
 * asymmetric query confidence, and a very         /  |/ \|/ \|  \
 * speedy O(k) membership test.                   /   |   |   \   \
 *                                               /   /|  /|\  |\   \
 * Because a Bloom filter can                   .   . . . . . . .   .
 * accept any input that can be       00000000001000101010101010100010000000000
 * hashed effectively (such as                       "    "    "
 * strings), that membership test                     \   |   /
 * tends to draw a crowd. TNSTAAFL, but                \  |  /
 * as caveats go, the Bloom filters' are                \ | /
 * more interesting than incapacitating.                 \|/
 *                                                        :
 * Most notably, it can tell you with certainty          {w}
 * that an item 'i' is *not* a member of set 's',
 * but it can only tell you with some finite
 * probability whether an item 'i' *is* a member
 * of set 's'.
 *
- * Still, along with the intriguing possibility of using bitwise AND and OR
+ * This file is subject to the terms and conditions of the GNU Lesser General
- * to compute the logical union and intersection of two filters, the cheap
+ * Public License. See the file LICENSE in the top level directory for more
- * cost of adding elements to the filter set, and the low memory requirements,
+ * details.
 * the Bloom filter is a good choice for many applications.
 *
- * NOTES
+ * @file
 * @author Jason Linehan <patientulysses@gmail.com>
 * @author Christian Mehlis <mehlis@inf.fu-berlin.de>
 * @author Freie Universität Berlin, Computer Systems & Telematics
 *
- * Let's look more closely at the probability values.
+ */
 *
 * Assume that a hash function selects each array position with equal
 * probability. If m is the number of bits in the array, and k is the number
 * of hash functions, then the probability that a certain bit is not set
 * to 1 by a certain hash function during the insertion of an element is
 *
 *      1-(1/m).
 *
 * The probability that it is not set to 1 by any of the hash functions is
 *
 *      (1-(1/m))^k.
 *
 * If we have inserted n elements, the probability that a certain bit is
 * set 0 is
 *
 *      (1-(1/m))^kn,
 *
 * Meaning that the probability said bit is set to 1 is therefore
 *
 *      1-([1-(1/m)]^kn).
 *
 * Now test membership of an element that is not in the set. Each of the k
 * array positions computed by the hash functions is 1 with a probability
 * as above. The probability of all of them being 1, which would cause the
 * algorithm to erroneously claim that the element is in the set, is often
 * given as
 *
 *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
 *
 * This is not strictly correct as it assumes independence for the
 * probabilities of each bit being set. However, assuming it is a close
 * approximation we have that the probability of false positives descreases
 * as m (the number of bits in the array) increases, and increases as n
 * (the number of inserted elements) increases. For a given m and n, the
 * value of k (the number of hash functions) that minimizes the probability
 * is
 *
 *      (m/n)ln(2) ~~ 0.7(m/n),
 *
 * which gives the false positive probability of
 *
 *      2^-k ~~ 0.6185^(m/n).
 *
 * The required number of bits m, given n and a desired false positive
 * probability p (and assuming the optimal value of k is used) can be
 * computed by substituting the optimal value of k in the probability
 * expression above:
 *
 *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
 *
 * which simplifies to
 *
 *      ln(p) = -(m/n) * (ln2)^2.
 *
 * This results in the equation
 *
 *      m = -((n*ln(p)) / ((ln(2))^2))
 *
 * The classic filter uses
 *
 *       1.44*log2(1/eta)
 *
 * bits of space per inserted key, where eta is the false positive rate of
 * the Bloom filter.
 *
 * AUTHOR
 * Jason Linehan (patientulysses@gmail.com)
 *
 * LICENSE
 * Public domain.
 *
 ******************************************************************************/
 #include <limits.h>
 #include <stdarg.h>
@ -112,23 +20,10 @@
 #include "bloom.h"
 /* Bit-array helpers. Every macro parameter is parenthesized so that
  * expression arguments (e.g. SETBIT(a, i + 8)) index the intended bit.
  * Note: SETBIT and GETBIT evaluate n twice; avoid side effects in n. */
 /* Set bit n in array a, where each element of a holds CHAR_BIT bits. */
 #define SETBIT(a,n) ((a)[(n)/CHAR_BIT] |= (1<<((n)%CHAR_BIT)))
 /* Test bit n in array a; yields a non-zero mask (not necessarily 1) if set. */
 #define GETBIT(a,n) ((a)[(n)/CHAR_BIT] &  (1<<((n)%CHAR_BIT)))
 /* Number of CHAR_BIT-wide elements needed to hold size bits (rounds up). */
 #define ROUND(size) (((size) + CHAR_BIT - 1) / CHAR_BIT)
 /******************************************************************************
 * bloom_new  Allocate and return a pointer to a new Bloom filter.
 * `````````
 * @size  : size of the bit array in the filter
 * @nfuncs: the number of hash functions
 * Returns: An allocated bloom filter
 *
 * USAGE
 * For best results, make 'size' a power of 2.
 *
 ******************************************************************************/
 struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
    struct bloom_t *bloom;
    va_list hashes;
@ -171,14 +66,6 @@ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
    return bloom;
 }
 /******************************************************************************
 * bloom_del  Delete a Bloom filter.
 * `````````
 * @bloom : The condemned.
 * Returns: nothing.
 *
 ******************************************************************************/
 void bloom_del(struct bloom_t *bloom)
 {
    free(bloom->a);
@ -186,18 +73,6 @@ void bloom_del(struct bloom_t *bloom)
    free(bloom);
 }
 /******************************************************************************
 * bloom_add  Add a string to a Bloom filter.
 * `````````
 * @bloom : Bloom filter
 * @s     : string to add
 * Returns: nothing.
 *
 * CAVEAT
 * Once a string has been added to the filter, it cannot be "removed"!
 *
 ******************************************************************************/
 void bloom_add(struct bloom_t *bloom, const char *s)
 {
    unsigned int hash;
@ -209,46 +84,6 @@ void bloom_add(struct bloom_t *bloom, const char *s)
    }
 }
 /******************************************************************************
 * bloom_check  Determine if a string is in the Bloom filter.
 * ```````````
 * @bloom : Bloom filter
 * @s     : string to add
 * Returns: false if string does not exist in the filter, otherwise true.
 *
 * NOTES
 *
 * So this is the freakshow that bored programmers pay a nickel to get a
 * peek at, step right up. This is the way the membership test works.
 *
 * The string 's' is hashed once for each of the 'k' hash functions, as
 * though we were planning to add it to the filter. Instead of adding it
 * however, we examine the bit that we *would* have set, and consider its
 * value.
 *
 * If the bit is 1 (set), the string we are hashing may be in the filter,
 * since it would have set this bit when it was originally hashed. However,
 * it may also be that another string just happened to produce a hash value
 * that would also set this bit. That would be a false positive. This is why
 * we have k > 1, so we can minimize the likelihood of false positives
 * occuring.
 *
 * If every bit corresponding to every one of the k hashes of our query
 * string is set, we can say with some probability of being correct that
 * the string we are holding is indeed "in" the filter. However, we can
 * never be sure.
 *
 * If, however, as we hash our string and peek at the resulting bit in the
 * filter, we find the bit is 0 (not set)... well now, that's different.
 * In this case, we can say with absolute certainty that the string we are
 * holding is *not* in the filter, because if it were, this bit would have
 * to be set.
 *
 * In this way, the Bloom filter can answer NO with absolute surety, but
 * can only speak a qualified YES.
 *
 ******************************************************************************/
 bool bloom_check(struct bloom_t *bloom, const char *s)
 {
    unsigned int hash;
--- a/sys/include/bloom.h
+++ b/sys/include/bloom.h
@ -1,3 +1,111 @@
 /**
 * bloom.c
 *
 * Bloom filters
 *
 * HISTORY
 *                                                   {x,  y,  z}
 * A Bloom filter is a probabilistic                  :   :   :
 * data structure with several interesting           /|\ /|\ /|\
 * properties, such as low memory usage,            / | X | X | \
 * asymmetric query confidence, and a very         /  |/ \|/ \|  \
 * speedy O(k) membership test.                   /   |   |   \   \
 *                                               /   /|  /|\  |\   \
 * Because a Bloom filter can                   .   . . . . . . .   .
 * accept any input that can be       00000000001000101010101010100010000000000
 * hashed effectively (such as                       "    "    "
 * strings), that membership test                     \   |   /
 * tends to draw a crowd. TANSTAAFL, but               \  |  /
 * as caveats go, the Bloom filters' are                \ | /
 * more interesting than incapacitating.                 \|/
 *                                                        :
 * Most notably, it can tell you with certainty          {w}
 * that an item 'i' is *not* a member of set 's',
 * but it can only tell you with some finite
 * probability whether an item 'i' *is* a member
 * of set 's'.
 *
 * Still, along with the intriguing possibility of using bitwise AND and OR
 * to compute the logical union and intersection of two filters, the cheap
 * cost of adding elements to the filter set, and the low memory requirements,
 * the Bloom filter is a good choice for many applications.
 *
 * NOTES
 *
 * Let's look more closely at the probability values.
 *
 * Assume that a hash function selects each array position with equal
 * probability. If m is the number of bits in the array, and k is the number
 * of hash functions, then the probability that a certain bit is not set
 * to 1 by a certain hash function during the insertion of an element is
 *
 *      1-(1/m).
 *
 * The probability that it is not set to 1 by any of the hash functions is
 *
 *      (1-(1/m))^k.
 *
 * If we have inserted n elements, the probability that a certain bit is
 * set 0 is
 *
 *      (1-(1/m))^kn,
 *
 * Meaning that the probability said bit is set to 1 is therefore
 *
 *      1-([1-(1/m)]^kn).
 *
 * Now test membership of an element that is not in the set. Each of the k
 * array positions computed by the hash functions is 1 with a probability
 * as above. The probability of all of them being 1, which would cause the
 * algorithm to erroneously claim that the element is in the set, is often
 * given as
 *
 *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
 *
 * This is not strictly correct as it assumes independence for the
 * probabilities of each bit being set. However, assuming it is a close
 * approximation we have that the probability of false positives decreases
 * as m (the number of bits in the array) increases, and increases as n
 * (the number of inserted elements) increases. For a given m and n, the
 * value of k (the number of hash functions) that minimizes the probability
 * is
 *
 *      (m/n)ln(2) ~~ 0.7(m/n),
 *
 * which gives the false positive probability of
 *
 *      2^-k ~~ 0.6185^(m/n).
 *
 * The required number of bits m, given n and a desired false positive
 * probability p (and assuming the optimal value of k is used) can be
 * computed by substituting the optimal value of k in the probability
 * expression above:
 *
 *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
 *
 * which simplifies to
 *
 *      ln(p) = -(m/n) * (ln2)^2.
 *
 * This results in the equation
 *
 *      m = -((n*ln(p)) / ((ln(2))^2))
 *
 * The classic filter uses
 *
 *       1.44*log2(1/eta)
 *
 * bits of space per inserted key, where eta is the false positive rate of
 * the Bloom filter.
 *
 */
 /**
 * @file
 * @author Christian Mehlis <mehlis@inf.fu-berlin.de>
 * @author Freie Universität Berlin, Computer Systems & Telematics
 */
 #ifndef _BLOOM_FILTER_H
 #define _BLOOM_FILTER_H
@ -5,8 +113,14 @@
 #include <stdbool.h>
 #include <stdint.h>
 /**
 * hashfp_t  hash function to use in the filter
 */
 typedef unsigned int (*hashfp_t)(const char *);
 /**
 * struct bloom_t bloom filter object
 */
 struct bloom_t {
    size_t m;
    size_t k;
@ -14,9 +128,77 @@ struct bloom_t {
    hashfp_t *hash;
 };
 /**
 * bloom_new  Allocate and return a pointer to a new Bloom filter.
 *
 * For best results, make 'size' a power of 2.
 *
 * @param size        size of the bit array in the filter
 * @param num_hashes  the number of hash functions
 * @param functions   varg function pointers, use hashfp_t
 *
 * @return An allocated bloom filter
 *
 */
 struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...);
 /**
 * bloom_del  Delete a Bloom filter.
 *
 * @param bloom The condemned
 * @return nothing
 *
 */
 void bloom_del(struct bloom_t *bloom);
 /**
 * bloom_add  Add a string to a Bloom filter.
 *
 * CAVEAT
 * Once a string has been added to the filter, it cannot be "removed"!
 *
 * @param bloom  Bloom filter
 * @param s      string to add
 * @return       nothing
 *
 */
 void bloom_add(struct bloom_t *bloom, const char *s);
 /**
 * bloom_check  Determine if a string is in the Bloom filter.
 *
 * The string 's' is hashed once for each of the 'k' hash functions, as
 * though we were planning to add it to the filter. Instead of adding it
 * however, we examine the bit that we *would* have set, and consider its
 * value.
 *
 * If the bit is 1 (set), the string we are hashing may be in the filter,
 * since it would have set this bit when it was originally hashed. However,
 * it may also be that another string just happened to produce a hash value
 * that would also set this bit. That would be a false positive. This is why
 * we have k > 1, so we can minimize the likelihood of false positives
 * occurring.
 *
 * If every bit corresponding to every one of the k hashes of our query
 * string is set, we can say with some probability of being correct that
 * the string we are holding is indeed "in" the filter. However, we can
 * never be sure.
 *
 * If, however, as we hash our string and peek at the resulting bit in the
 * filter, we find the bit is 0 (not set)... well now, that's different.
 * In this case, we can say with absolute certainty that the string we are
 * holding is *not* in the filter, because if it were, this bit would have
 * to be set.
 *
 * In this way, the Bloom filter can answer NO with absolute surety, but
 * can only speak a qualified YES.
 *
 * @param bloom  Bloom filter
 * @param s      string to check
 * @return       false if string does not exist in the filter
 * @return       true if string may be in the filter
 *
 */
 bool bloom_check(struct bloom_t *bloom, const char *s);
 #endif
--- a/sys/include/hashes.h
+++ b/sys/include/hashes.h
@ -1,6 +1,23 @@
-/******************************************************************************
+/**
 * This file contains some simple hash functions
 *
 * Copyright (C) 2013 Freie Universität Berlin
 *
 * This file is subject to the terms and conditions of the GNU Lesser General
 * Public License. See the file LICENSE in the top level directory for more
 * details.
 */
 /**
 * @file
 * @author      Jason Linehan <patientulysses@gmail.com>
 * @author      Freie Universität Berlin, Computer Systems & Telematics
 * @author      Christian Mehlis <mehlis@inf.fu-berlin.de>
 */
 /**
 * djb2_hash
- * `````````
+ *
 * HISTORY
 * This algorithm (k=33) was first reported by Dan Bernstein many years
 * ago in comp.lang.c. Another version of this algorithm (now favored by
@ -10,8 +27,7 @@
 *
 * The magic of number 33 (why it works better than many other constants,
 * prime or not) has never been adequately explained.
- *
+ */
 ******************************************************************************/
 static inline unsigned long djb2_hash(const char *str)
 {
    unsigned long hash;
@ -26,9 +42,9 @@ static inline unsigned long djb2_hash(const char *str)
    return hash;
 }
-/******************************************************************************
+/**
 * sdbm_hash
- * `````````
+ *
 * HISTORY
 * This algorithm was created for sdbm (a public-domain reimplementation
 * of ndbm) database library. It was found to do well in scrambling bits,
@ -45,7 +61,7 @@ static inline unsigned long djb2_hash(const char *str)
 * out to be a prime. this is one of the algorithms used in berkeley db
 * (see sleepycat) and elsewhere.
 *
- ******************************************************************************/
+ */
 static inline unsigned long sdbm_hash(const char *str)
 {
    unsigned long hash;
@ -60,9 +76,9 @@ static inline unsigned long sdbm_hash(const char *str)
    return hash;
 }
-/******************************************************************************
+/**
 * lose lose
- * `````````
+ *
 * HISTORY
 * This hash function appeared in K&R (1st ed) but at least the reader
 * was warned:
@ -78,8 +94,7 @@ static inline unsigned long sdbm_hash(const char *str)
 * checking something like Knuth's Sorting and Searching, so it stuck.
 * It is now found mixed with otherwise respectable code, eg. cnews. sigh.
 * [see also: tpop]
- *
+ */
 ******************************************************************************/
 static inline unsigned long kr_hash(const char *str)
 {
    unsigned int hash;
@ -94,12 +109,11 @@ static inline unsigned long kr_hash(const char *str)
    return hash;
 }
-/******************************************************************************
+/**
 * sax_hash
 * ````````
 * Shift, Add, XOR
 *
- ******************************************************************************/
+ * Shift, Add, XOR
 */
 static inline unsigned int sax_hash(const char *key)
 {
    unsigned int h;
@ -114,14 +128,13 @@ static inline unsigned int sax_hash(const char *key)
 }
-/******************************************************************************
+/**
 * dek_hash
- * ````````
+ *
 * HISTORY
 * Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3,
 * under the topic of "Sorting and Search", Chapter 6.4.
- *
+ */
 ******************************************************************************/
 static inline unsigned int dek_hash(const char *str, unsigned int len)
 {
    unsigned int hash;
@ -138,13 +151,12 @@ static inline unsigned int dek_hash(const char *str, unsigned int len)
 }
-/******************************************************************************
+/**
 * fnv_hash
- * ````````
+ *
 * NOTE
 * For a more fully featured and modern version of this hash, see fnv32.c
- *
+ */
 ******************************************************************************/
 static inline unsigned int fnv_hash(const char *str)
 {
 #define FNV_PRIME 0x811C9DC5