Author | Jamozed <[email protected]> |
Date | 2022-01-05 11:39:34 |
Commit | 7af21319ca0ba632626952bc3a74e1fb295537fa |
Parent | 62177455a95ed26f0b89410a23f4721918e9e882 |
map: Reimplement map using robin hood hashing
Diffstat
M | README.md | | | 4 | +++- |
M | src/map.c | | | 182 | ++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------- |
M | src/map.h | | | 22 | ++++++++++++++-------- |
M | src/test/test_map.c | | | 68 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- |
4 files changed, 193 insertions, 83 deletions
diff --git a/README.md b/README.md index eaa56a9..09ef39c 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ intended to provide common portable extensions to the C standard library. | endian | Endianness related functions | | | error | Error reporting functions | | | fnv | FNV hashing algorithms | | +| map | Hashmap data structure | | | mode | Parse numeric or symbolic POSIX modes | | | optget | Parse command line options | | | rc2 | RC2 encryption algorithm | RFC 2268 | @@ -25,7 +26,8 @@ intended to provide common portable extensions to the C standard library. **libutil** is being developed on x86-64 Linux, and some components may depend on POSIX provided functionality. -**libutil** source files are intended to be built directly into your project. +**libutil** source files are intended to be built directly into projects, rather +than being compiled separately and linked. ### Dependencies diff --git a/src/map.c b/src/map.c index 3fc9bb5..afbb733 100644 --- a/src/map.c +++ b/src/map.c @@ -1,4 +1,4 @@ -// util/map.c, version 0.0.0 +// util/map.c, version 0.1.0 // Map utility source file from libutil // Copyright (C) 2021, Jakob Wakeling // All rights reserved. @@ -30,111 +30,159 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */ +/* + This file uses the currently non-standard 'typeof' operator. Its use is + considered acceptable because it is supported by both GCC and Clang, and + POSIX extensions are used here anyway. Additionally, it is expected that the + 'typeof' operator will become standard in C23. 
+*/ + #include "alloc.h" -#include "fnv.h" #include "map.h" #include "util.h" -#include <malloc.h> - +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#define LOAD_FACTOR 0.75 +#define LOAD_FACTOR 0.90 + +UINT map_initial_capacity = 64; static void map_resize(map *m); +static u64 hash(const char *s, UINT l); /* Initialise a map. */ -map map_init(UINT el, void (*free)(void *)) { - return (map){ NULL, NULL, 0, 0, el, free }; +map map_init(void (*free)(void *)) { + return (map){ NULL, 0, 0, free }; } -/* Unititialise a map. */ +/* Uninitialise a map. */ void map_free(map *m) { - for (map_ent *e = m->n, *n; e; e = n) { - n = e->n; if (m->free) { m->free(e->v); } free(e); + for (UINT i = 0; i < m->ac; i += 1) { + if (m->a[i].h == 0) { continue; } free(m->a[i].k); + if (m->free != NULL) { m->free(m->a[i].v); } } - free(m->a); + free(m->a); *m = (map){ NULL, 0, 0, NULL }; } -/* Insert a pointer into a map. */ -void *map_insert(map *m, const char *k, void *v) { +#define SWAP(a, b) { register typeof (a) t = a; a = b; b = t; } +#define DIB(i) ((i + m->ac - (m->a[i].h % m->ac)) % m->ac) + +/* Insert a key-value pair into a map. The key is duplicated. 
*/ +void map_insert(map *m, char *k, void *v) { if (m->ac == 0 || m->al >= ((f64)m->ac * LOAD_FACTOR)) { map_resize(m); } - UINT index = fnv1a64(k, strlen(k)) % m->ac; void *old = map_remove(m, k); - - /* Allocate and define the entry */ - map_ent *e = xcalloc(1, sizeof (*e)); - // map_debug(m); - e->k = (char *)k; e->v = v; e->n = m->n; m->n = e; + UINT h = hash(k, strlen(k)), i = h % m->ac; k = strdup(k); - /* Insert the entry at begining of the buckets chain */ - // printf("m->a: %p\n", m->a); - // printf("%zu\n", malloc_usable_size(m->a + (index * m->el))); - memcpy(&e->c, m->a + (index * m->el), m->el); - memcpy(m->a + (index * m->el), &e, m->el); m->al += 1; - - return old; + for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) { + if (m->a[i].h == 0) { + /* If an empty bucket is found, insert here */ + m->a[i] = (typeof (*m->a)){ h, k, v }; + m->al += 1; return; + } + + /* Calculate tsid, the DIB of the item at the current index */ + UINT tsid = (i + m->ac - (m->a[i].h % m->ac)) % m->ac; + + if (dist > tsid) { + SWAP(m->a[i].h, h); + SWAP(m->a[i].k, k); + SWAP(m->a[i].v, v); + + dist = tsid; + } + } } -/* Lookup a pointer from a map. */ -void *map_lookup(map *m, const char *k) { - UINT index = fnv1a64(k, strlen(k)) % m->ac; +/* Lookup the value associated with a key from a map. */ +void *map_lookup(map *m, char *k) { + UINT h = hash(k, strlen(k)), i = h % m->ac; - map_ent *e; memcpy(&e, m->a + (index * m->el), m->el); - for (; e; e = e->c) { if (!strcmp(e->k, k)) { return e->v; } } - - return NULL; + for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) { + if (m->a[i].h == 0) { return NULL; } + + if (dist > DIB(i)) { return NULL; } + if ((m->a[i].h == h) && (strcmp(m->a[i].k, k) == 0)) { + return m->a[i].v; + } + } } -/* Remove a pointer from the top of a map. */ -void *map_remove(map *m, const char *k) { - return NULL; /* TODO, this needs a 'p' pointer in the entry struct */ +/* Remove a key-value pair from a map. 
*/ +void *map_remove(map *m, char *k) { + UINT h = hash(k, strlen(k)), i = h % m->ac; + + for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) { + if (m->a[i].h == 0) { return NULL; } + + if (dist > DIB(i)) { return NULL; } + if ((m->a[i].h == h) && (strcmp(m->a[i].k, k) == 0)) { + /* If the element to be removed is found, then deallocate it */ + if (m->free != NULL) { m->free(m->a[i].v); } free(m->a[i].k); + m->a[i] = (typeof (*m->a)){ 0, NULL, NULL }; m->al -= 1; + + /* */ + for (UINT j = (i + 1) % m->ac;; i = j, j = (j + 1) % m->ac) { + if (m->a[j].h == 0 || DIB(j) == 0) { break; } + + SWAP(m->a[i].h, m->a[j].h); + SWAP(m->a[i].k, m->a[j].k); + SWAP(m->a[i].v, m->a[j].v); + } + + /* + TODO I am unsure if I want to have this procedure return the + removed value or simply an acknowledgement of its removal + */ + return (void *)true; + } + } } -/* Print a basic representation of the map to stdout. */ +/* Print a basic representation of a map to stdout. */ void map_print(map *m) { - for (map_ent *e = m->n; e; e = e->n) { - printf("%s -> %s\n", e->k, (char *)e->v); + for (UINT i = 0; i < m->ac; i += 1) if (m->a[i].h != 0) { + printf("%s -> %s\n", m->a[i].k, (char *)m->a[i].v); } } -/* Print a debug representation of the map to stdout. */ -/* FIXME for some reason printf leaves junk in my calloc (m->a[33]) */ +/* Print a debug representation of a map to stdout. */ void map_debug(map *m) { - for (UINT i = 0; i != m->ac; i += 1) { - map_ent *e; memcpy(&e, m->a + (i * m->el), m->el); - - // printf("%zu: ", i); - for (; e; e = e->c) { printf("%s -> %s, ", e->k, (char *)e->v); } - // printf("\n"); - } printf("\n"); + for (UINT i = 0; i < m->ac; i += 1) { + if (m->a[i].h == 0) { printf("[%zu] %lu\n", i, m->a[i].h); } + else printf( + "[%zu] %lu, %s -> %s, DIB: %zu\n", + i, m->a[i].h, m->a[i].k, (char *)m->a[i].v, DIB(i) + ); + } } /* Double the number of buckets in a map. 
*/ static void map_resize(map *m) { - if (m->ac == 0) { m->ac = 256; } else { m->ac *= 2; } - - /* If the map is empty, simply resize it without rehashing */ - if (m->al == 0) { - // printf("%zu * %zu = %zu; %zu\n", m->ac, m->el, m->ac * m->el, m->ac * 8 * m->el); fflush(stdout); - free(m->a); m->a = xcalloc(m->ac * 8, m->el); - // printf("m->a: %p\n", m->a); - // printf("%p: %zu\n", m->a, malloc_usable_size(m->a)); - // printf("%p: %zu\n", m->a + 1696, malloc_usable_size(m->a + 1696)); - // printf("%zu\n", ((m->a + 1696) - m->a)); - fflush(stdout); + /* If the map is empty, simply allocate it without rehashing */ + if (m->ac == 0) { + m->ac = map_initial_capacity; + m->a = xcalloc(m->ac, sizeof (*m->a)); return; } + /* Otherwise rehash every element into a new resized map */ - else { - // printf("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n"); fflush(stdout); - - map old = *m; - - m->a = xcalloc(m->ac * 8, m->el); m->n = NULL; m->al = 0; - for (map_ent *e = old.n; e; e = e->n) { map_insert(m, e->k, e->v); } + map old = *m; m->ac *= 2; m->a = xcalloc(m->ac, sizeof (*m->a)); m->al = 0; + + for (UINT i = 0; i < old.ac; i += 1) { + if (old.a[i].h == 0) { continue; } - map_free(&old); + map_insert(m, old.a[i].k, old.a[i].v); + free(old.a[i].k); } + + free(old.a); +} + +/* Compute the hash of some data. Will not return 0. */ +static u64 hash(const char *dat, UINT len) { + register u64 fnv = 0xCBF29CE484222325; + for (; len; len -= 1, dat += 1) { fnv ^= *dat; fnv *= 0x00000100000001B3; } + fnv |= fnv == 0; return fnv; } diff --git a/src/map.h b/src/map.h index 59c37d5..652aea4 100644 --- a/src/map.h +++ b/src/map.h @@ -1,4 +1,4 @@ -// util/map.h, version 0.0.0 +// util/map.h, version 0.1.0 // Map utility header file from libutil // Copyright (C) 2021, Jakob Wakeling // All rights reserved. @@ -35,17 +35,23 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
#include "util.h" -#define MAP_INSERT(m, k, v) map_insert(m, k, (void *)(UINT)(e)) +typedef struct { + struct { u64 h; char *k; void *v; } *a; + UINT al, ac; void (*free)(void *); +} map; -typedef struct map_ent { char *k; void *v; struct map_ent *c, *p, *n; } map_ent; -typedef struct { map_ent **a, *n; UINT al, ac, el; void (*free)(void *); } map; +// #define MAP_INIT(type, free) map_init(sizeof (type), (free)) +// #define MAP_INSERT(m, k, v) map_insert(m, k, (void *)(UINT)(e)) -extern map map_init(UINT el, void (*free)(void *)); +/* This should be used for debugging or testing only. */ +extern UINT map_initial_capacity; + +extern map map_init(void (*free)(void *)); extern void map_free(map *m); -extern void *map_insert(map *m, const char *k, void *v); -extern void *map_lookup(map *m, const char *k); -extern void *map_remove(map *m, const char *k); +extern void map_insert(map *m, char *k, void *v); +extern void *map_lookup(map *m, char *k); +extern void *map_remove(map *m, char *k); extern void map_print(map *m); extern void map_debug(map *m); diff --git a/src/test/test_map.c b/src/test/test_map.c index adbbb45..9a74c77 100644 --- a/src/test/test_map.c +++ b/src/test/test_map.c @@ -26,19 +26,73 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 
int main(void) { { - map m = map_init(sizeof (void *), NULL); + map_initial_capacity = 4; + map m = map_init(NULL); map_insert(&m, "key0", "value0"); - // map_insert(&m, "key1", "value1"); + map_insert(&m, "key4", "value4"); + map_insert(&m, "key3", "value3"); - // ASSERT("T000 INSERT AND LOOKUP STRING 0", - // strcmp(map_lookup(&m, "key0"), "value0") == 0); - // ASSERT("T001 INSERT AND LOOKUP STRING 1", - // strcmp(map_lookup(&m, "key1"), "value1") == 0); + ASSERT("T000 INSERT AND LOOKUP STRING 0", + strcmp(map_lookup(&m, "key0"), "value0") == 0); + ASSERT("T001 INSERT AND LOOKUP STRING 1", + strcmp(map_lookup(&m, "key4"), "value4") == 0); + ASSERT("T002 INSERT AND LOOKUP STRING 2", + strcmp(map_lookup(&m, "key3"), "value3") == 0); + + ASSERT("T003 LOOKUP NONEXISTENT STRING", + map_lookup(&m, "key1") == NULL); + + map_remove(&m, "key4"); + + ASSERT("T004 REMOVE AND LOOKUP STRING", + map_lookup(&m, "key4") == NULL); + + ASSERT("T006 LOOKUP STRING AFTER REMOVE 0", + strcmp(map_lookup(&m, "key0"), "value0") == 0); + ASSERT("T007 LOOKUP STRING AFTER REMOVE 1", + strcmp(map_lookup(&m, "key3"), "value3") == 0); + + ASSERT("T008 REMOVE NONEXISTENT STRING", + map_remove(&m, "key4") == NULL); + + map_free(&m); + } + + { + map_initial_capacity = 4; + map m = map_init(NULL); + + map_insert(&m, "key0", "value0"); + map_insert(&m, "key3", "value3"); + map_insert(&m, "key4", "value4"); + + ASSERT("T100 INSERT AND LOOKUP STRING 0", + strcmp(map_lookup(&m, "key0"), "value0") == 0); + ASSERT("T101 INSERT AND LOOKUP STRING 1", + strcmp(map_lookup(&m, "key3"), "value3") == 0); + ASSERT("T102 INSERT AND LOOKUP STRING 2", + strcmp(map_lookup(&m, "key4"), "value4") == 0); + + ASSERT("T103 LOOKUP NONEXISTENT STRING", + map_lookup(&m, "key1") == NULL); + + map_remove(&m, "key4"); + + ASSERT("T104 REMOVE AND LOOKUP STRING", + map_lookup(&m, "key4") == NULL); + + ASSERT("T106 LOOKUP STRING AFTER REMOVE 0", + strcmp(map_lookup(&m, "key0"), "value0") == 0); + ASSERT("T107 LOOKUP STRING AFTER 
REMOVE 1", + strcmp(map_lookup(&m, "key3"), "value3") == 0); + + ASSERT("T108 REMOVE NONEXISTENT STRING", + map_remove(&m, "key4") == NULL); map_free(&m); } - // printf("%d of %d tests passed\n", testspassed, testsrun); + printf("%d of %d tests passed\n", testspassed, testsrun); return testsfailed; }