From 7d60132816341e15a853f2f66f06d0755d0f2daa Mon Sep 17 00:00:00 2001
From: "Rob Kendrick (humdrum)"
Date: Thu, 16 May 2013 17:15:29 +0100
Subject: [PATCH] Painfully trivial bloom filter implementation to experiment
 with optimising urldb lookups

---
 utils/bloom.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++
 utils/bloom.h |  99 ++++++++++++++++++++++++
 2 files changed, 302 insertions(+)
 create mode 100644 utils/bloom.c
 create mode 100644 utils/bloom.h

diff --git a/utils/bloom.c b/utils/bloom.c
new file mode 100644
index 000000000..e6b9dcf92
--- /dev/null
+++ b/utils/bloom.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2013 Rob Kendrick
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include "utils/bloom.h"
+
+/**
+ * Hash a string, returning a 32bit value. The hash algorithm used is
+ * Fowler Noll Vo - a very fast and simple hash, ideal for short strings.
+ * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details.
+ *
+ * Bytes are read as unsigned octets so that data with the top bit set
+ * hashes identically whether or not plain char is signed.
+ *
+ * \param datum The string to hash.
+ * \param len size_t of data length.
+ * \return The calculated hash value for the datum, or 0 if datum is NULL.
+ */
+
+static inline uint32_t fnv(const char *datum, size_t len)
+{
+	const unsigned char *octet = (const unsigned char *)datum;
+	uint32_t z = 0x811c9dc5;
+
+	if (datum == NULL)
+		return 0;
+
+	while (len--) {
+		z *= 0x01000193;
+		z ^= *octet++;
+	}
+
+	return z;
+}
+
+struct bloom_filter {
+	size_t size;		/**< Size of filter[] in bytes */
+	uint32_t items;		/**< Number of insertions performed */
+	uint8_t filter[];	/**< Bit array, zeroed on creation */
+};
+
+/**
+ * Map a hash value onto a bit position within a filter.
+ *
+ * The position is taken over every bit of the filter (size * 8), not
+ * over its bytes: deriving the byte as hash % size and the bit as
+ * hash % 8 ties the two together whenever size is a multiple of eight,
+ * leaving seven of the eight bits in each byte permanently unused.
+ *
+ * \param b Bloom filter the position is wanted for; size is non-zero
+ * \param hash Hash value to map
+ * \return Bit position in the range [0, size * 8)
+ */
+static inline uint64_t bloom_bit(const struct bloom_filter *b, uint32_t hash)
+{
+	return hash % ((uint64_t)b->size * 8);
+}
+
+struct bloom_filter *bloom_create(size_t size)
+{
+	struct bloom_filter *r;
+
+	/* An empty filter cannot hold a bit (and would make every lookup
+	 * divide by zero); also refuse sizes that overflow the allocation. */
+	if (size == 0 || size > SIZE_MAX - sizeof(*r))
+		return NULL;
+
+	r = calloc(1, sizeof(*r) + size);
+	if (r == NULL)
+		return NULL;
+
+	r->size = size;
+
+	return r;
+}
+
+void bloom_destroy(struct bloom_filter *b)
+{
+	free(b);
+}
+
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z)
+{
+	uint32_t hash = fnv(s, z);
+	bloom_insert_hash(b, hash);
+}
+
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash)
+{
+	uint64_t bit = bloom_bit(b, hash);
+
+	b->filter[bit / 8] |= (uint8_t)(1u << (bit % 8));
+	b->items++;
+}
+
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z)
+{
+	uint32_t hash = fnv(s, z);
+	return bloom_search_hash(b, hash);
+}
+
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash)
+{
+	uint64_t bit = bloom_bit(b, hash);
+
+	return (b->filter[bit / 8] & (1u << (bit % 8))) != 0;
+}
+
+uint32_t bloom_items(struct bloom_filter *b)
+{
+	return b->items;
+}
+
+#ifdef TEST_RIG
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+/** Words inserted into the filter, and words probed after them. */
+enum { TEST_WORDS = 8192 };
+
+int main(void)
+{
+	struct bloom_filter *b = bloom_create(8192);
+	FILE *dict = fopen("/usr/share/dict/words", "r");
+	char buf[256];
+	int added = 0, false_positives = 0, total = 0;
+
+	if (b == NULL || dict == NULL) {
+		fprintf(stderr, "unable to set up bloom filter test\n");
+		if (dict != NULL)
+			fclose(dict);
+		bloom_destroy(b);
+		return 1;
+	}
+
+	/* Field width keeps an over-long word from overrunning buf. */
+	while (added < TEST_WORDS && fscanf(dict, "%255s", buf) == 1) {
+		printf("adding %s\n", buf);
+		bloom_insert_str(b, buf, strlen(buf));
+		added++;
+	}
+
+	printf("adding NetSurf\n");
+
+	bloom_insert_str(b, "NetSurf", 7);
+	printf("checking NetSurf (should be true)\n");
+	assert(bloom_search_str(b, "NetSurf", 7));
+
+	if (fseek(dict, 0, SEEK_SET) != 0) {
+		perror("fseek");
+		fclose(dict);
+		bloom_destroy(b);
+		return 1;
+	}
+
+	for (int i = 0; i < added; i++) {
+		if (fscanf(dict, "%255s", buf) != 1)
+			break;
+		printf("checking %s (should be true)\n", buf);
+		assert(bloom_search_str(b, buf, strlen(buf)));
+	}
+
+	/* Only the probes expected to miss count towards the rate. */
+	while (total < TEST_WORDS && fscanf(dict, "%255s", buf) == 1) {
+		printf("checking %s (should be false)\n", buf);
+		if (bloom_search_str(b, buf, strlen(buf)))
+			false_positives++;
+		total++;
+	}
+
+	printf("false positives: %d of %d, %f%%\n",
+			false_positives, total,
+			total ? ((float)false_positives / total) * 100 : 0.0f);
+
+	fclose(dict);
+	bloom_destroy(b);
+
+	return 0;
+}
+
+#endif /* TEST_RIG */
+
diff --git a/utils/bloom.h b/utils/bloom.h
new file mode 100644
index 000000000..4a7bd3800
--- /dev/null
+++ b/utils/bloom.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Rob Kendrick
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#ifndef _NETSURF_UTILS_BLOOM_H_
+#define _NETSURF_UTILS_BLOOM_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+struct bloom_filter;
+
+/**
+ * Create a new bloom filter.
+ *
+ * \param size Size of bloom filter in bytes; must be non-zero
+ * \return Handle for newly-created bloom filter, or NULL on failure
+ */
+struct bloom_filter *bloom_create(size_t size);
+
+/**
+ * Destroy a previously-created bloom filter
+ *
+ * \param b Bloom filter to destroy
+ */
+void bloom_destroy(struct bloom_filter *b);
+
+/**
+ * Insert a string of given length (may include NULs) into the filter,
+ * using an internal hash function.
+ *
+ * \param b Bloom filter to add to
+ * \param s Pointer to data
+ * \param z Length of data
+ */
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Insert a given hash value into the filter, should you already have
+ * one to hand.
+ *
+ * \param b Bloom filter to add to
+ * \param hash Value to add
+ */
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Search the filter for the given string, assuming it was added by
+ * bloom_insert_str(). May return false-positives.
+ *
+ * \param b Bloom filter to search
+ * \param s Pointer to data to search for
+ * \param z Length of data
+ *
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Search the filter for the given hash value, assuming it was added by
+ * bloom_insert_hash(). May return false-positives.
+ *
+ * \param b Bloom filter to search
+ * \param hash Hash value to search for
+ *
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Find out how many items have been added to this bloom filter. This
+ * is useful for deciding the size of a new bloom filter should you
+ * need to rehash it.
+ *
+ * \param b Bloom filter to examine
+ *
+ * \return Number of items that have been added
+ */
+uint32_t bloom_items(struct bloom_filter *b);
+
+#endif