Painfully trivial bloom filter implementation to experiment with optimising urldb lookups
This commit is contained in:
parent
023c014ac4
commit
7d60132816
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org>
|
||||
*
|
||||
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
||||
*
|
||||
* NetSurf is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; version 2 of the License.
|
||||
*
|
||||
* NetSurf is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* Trivial bloom filter */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "utils/bloom.h"
|
||||
|
||||
/**
 * Hash a buffer, returning a 32bit value.  The hash algorithm used is
 * Fowler Noll Vo (the FNV-1 variant: multiply by the FNV prime, then XOR
 * in the next byte) - a very fast and simple hash, ideal for short strings.
 * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details.
 *
 * \param  datum  The data to hash; may contain NUL bytes.
 * \param  len    Number of bytes of \a datum to hash.
 * \return The calculated hash value for the datum, or 0 if \a datum is NULL.
 */
static inline uint32_t fnv(const char *datum, size_t len)
{
	/* Read the input as unsigned bytes.  Plain char may be signed, in
	 * which case bytes >= 0x80 would sign-extend and flip the top 24
	 * bits of the hash, giving platform-dependent results. */
	const unsigned char *p = (const unsigned char *)datum;
	uint32_t z = 0x811c9dc5; /* FNV 32bit offset basis */

	if (datum == NULL)
		return 0;

	while (len--) {
		z *= 0x01000193; /* FNV 32bit prime */
		z ^= *p++;
	}

	return z;
}
|
||||
|
||||
/**
 * Bloom filter state.  Allocated as a single block with the bit array
 * stored inline after the header fields.
 */
struct bloom_filter {
	size_t size;      /**< Length of \ref filter in bytes */
	uint32_t items;   /**< Number of insertions performed so far */
	uint8_t filter[]; /**< The filter's bit array, \ref size bytes long */
};
|
||||
|
||||
struct bloom_filter *bloom_create(size_t size)
|
||||
{
|
||||
struct bloom_filter *r = calloc(sizeof(*r) + size, 1);
|
||||
|
||||
if (r == NULL)
|
||||
return NULL;
|
||||
|
||||
r->size = size;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
 * Release a bloom filter.
 *
 * \param  b  Filter to free; NULL is accepted and ignored, as with free().
 */
void bloom_destroy(struct bloom_filter *b)
{
	free(b);
}
|
||||
|
||||
/**
 * Hash \a z bytes at \a s with fnv() and insert the result into the filter.
 *
 * \param  b  Bloom filter to add to
 * \param  s  Pointer to the data to add
 * \param  z  Length of the data in bytes
 */
void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z)
{
	uint32_t hash = fnv(s, z);

	bloom_insert_hash(b, hash);
}
|
||||
|
||||
void bloom_insert_hash(struct bloom_filter *b, uint32_t hash)
|
||||
{
|
||||
int index = hash % b->size;
|
||||
int bit = hash % 8;
|
||||
|
||||
b->filter[index] |= (1 << bit);
|
||||
b->items++;
|
||||
}
|
||||
|
||||
bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z)
|
||||
{
|
||||
uint32_t hash = fnv(s, z);
|
||||
return bloom_search_hash(b, hash);
|
||||
}
|
||||
|
||||
bool bloom_search_hash(struct bloom_filter *b, uint32_t hash)
|
||||
{
|
||||
int index = hash % b->size;
|
||||
int bit = hash % 8;
|
||||
|
||||
return (b->filter[index] & (1 << bit)) != 0;
|
||||
}
|
||||
|
||||
uint32_t bloom_items(struct bloom_filter *b)
|
||||
{
|
||||
return b->items;
|
||||
}
|
||||
|
||||
#ifdef TEST_RIG
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
/**
 * Test rig: fill a filter from the system dictionary, confirm every
 * inserted word is found, then probe with the following words (which were
 * not inserted) to count false positives.
 *
 * \return 0 on success, 1 if the filter or the dictionary is unavailable.
 */
int main(int argc, char *arg[])
{
	struct bloom_filter *b;
	FILE *dict;
	char buf[256]; /* paired with the %255s width used below */
	int false_positives = 0, total = 0;
	int added = 0;

	(void)argc;
	(void)arg;

	b = bloom_create(8192);
	if (b == NULL) {
		fprintf(stderr, "unable to create bloom filter\n");
		return 1;
	}

	dict = fopen("/usr/share/dict/words", "r");
	if (dict == NULL) {
		perror("/usr/share/dict/words");
		bloom_destroy(b);
		return 1;
	}

	for (int i = 0; i < 8192; i++) {
		/* Stop early if the dictionary is shorter than expected,
		 * rather than re-adding a stale buffer. */
		if (fscanf(dict, "%255s", buf) != 1)
			break;
		printf("adding %s\n", buf);
		bloom_insert_str(b, buf, strlen(buf));
		added++;
	}

	printf("adding NetSurf\n");

	bloom_insert_str(b, "NetSurf", 7);
	printf("checking NetSurf (should be true)\n");
	assert(bloom_search_str(b, "NetSurf", 7));

	if (fseek(dict, 0, SEEK_SET) != 0) {
		perror("fseek");
		fclose(dict);
		bloom_destroy(b);
		return 1;
	}

	/* Re-read exactly the words that were inserted. */
	for (int i = 0; i < added; i++) {
		if (fscanf(dict, "%255s", buf) != 1)
			break;
		printf("checking %s (should be true)\n", buf);
		assert(bloom_search_str(b, buf, strlen(buf)));

		total++;
	}

	/* The words that follow were never inserted; any hit is a false
	 * positive. */
	for (int i = 0; i < 8192; i++) {
		if (fscanf(dict, "%255s", buf) != 1)
			break;
		printf("checking %s (should be false)\n", buf);
		if (bloom_search_str(b, buf, strlen(buf)) == true)
			false_positives++;
		total++;
	}

	printf("false positives: %d of %d, %f%%\n",
			false_positives, total,
			total > 0 ? ((float)false_positives / total) * 100 : 0.0f);

	fclose(dict);
	bloom_destroy(b);

	return 0;
}
|
||||
|
||||
#endif /* TEST_RIG */
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
/*
|
||||
* Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org>
|
||||
*
|
||||
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
||||
*
|
||||
* NetSurf is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; version 2 of the License.
|
||||
*
|
||||
* NetSurf is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/** \file
|
||||
* Trivial bloom filter */
|
||||
|
||||
#ifndef _NETSURF_UTILS_BLOOM_H_
#define _NETSURF_UTILS_BLOOM_H_

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct bloom_filter;

/**
 * Create a new bloom filter.
 *
 * \param size Size of bloom filter in bytes; must be non-zero
 * \return Handle for newly-created bloom filter, or NULL on failure.
 *         Release it with bloom_destroy().
 */
struct bloom_filter *bloom_create(size_t size);

/**
 * Destroy a previously-created bloom filter
 *
 * \param b Bloom filter to destroy
 */
void bloom_destroy(struct bloom_filter *b);

/**
 * Insert a string of given length (may include NULs) into the filter,
 * using an internal hash function.
 *
 * \param b Bloom filter to add to
 * \param s Pointer to data
 * \param z Length of data in bytes
 */
void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z);

/**
 * Insert a given hash value into the filter, should you already have
 * one to hand.
 *
 * \param b Bloom filter to add to
 * \param hash Value to add
 */
void bloom_insert_hash(struct bloom_filter *b, uint32_t hash);

/**
 * Search the filter for the given string, assuming it was added by
 * bloom_insert_str().  May return false-positives.
 *
 * \param b Bloom filter to search
 * \param s Pointer to data to search for
 * \param z Length of data in bytes
 *
 * \return False if never added, True if it might have been.
 */
bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z);

/**
 * Search the filter for the given hash value, assuming it was added by
 * bloom_insert_hash().  May return false-positives.
 *
 * \param b Bloom filter to search
 * \param hash Hash value to search for
 *
 * \return False if never added, True if it might have been.
 */
bool bloom_search_hash(struct bloom_filter *b, uint32_t hash);

/**
 * Find out how many items have been added to this bloom filter.  This
 * is useful for deciding the size of a new bloom filter should you
 * need to rehash it.  Each insertion is counted, so adding the same
 * value twice counts as two items.
 *
 * \param b Bloom filter to examine
 *
 * \return Number of items that have been added
 */
uint32_t bloom_items(struct bloom_filter *b);

#endif
|
Loading…
Reference in New Issue