libutil

C Utility Library
git clone http://git.omkov.net/libutil
Log | Tree | Refs | README | LICENCE | Download

Author: Jamozed <[email protected]>
Date: 2022-01-05 11:39:34
Commit: 7af21319ca0ba632626952bc3a74e1fb295537fa
Parent: 62177455a95ed26f0b89410a23f4721918e9e882

map: Reimplement map using robin hood hashing

Diffstat

M README.md | 4 +++-
M src/map.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M src/map.h | 22 ++++++++++++++--------
M src/test/test_map.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------

4 files changed, 193 insertions, 83 deletions

diff --git a/README.md b/README.md
index eaa56a9..09ef39c 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ intended to provide common portable extensions to the C standard library.
 | endian           | Endianness related functions             |          |
 | error            | Error reporting functions                |          |
 | fnv              | FNV hashing algorithms                   |          |
+| map              | Hashmap data structure                   |          |
 | mode             | Parse numeric or symbolic POSIX modes    |          |
 | optget           | Parse command line options               |          |
 | rc2              | RC2 encryption algorithm                 | RFC 2268 |
@@ -25,7 +26,8 @@ intended to provide common portable extensions to the C standard library.
 **libutil** is being developed on x86-64 Linux, and some components may depend
 on POSIX provided functionality.
 
-**libutil** source files are intended to be built directly into your project.
+**libutil** source files are intended to be built directly into projects, rather
+than being compiled separately and linked.
 
 ### Dependencies
 
diff --git a/src/map.c b/src/map.c
index 3fc9bb5..afbb733 100644
--- a/src/map.c
+++ b/src/map.c
@@ -1,4 +1,4 @@
-// util/map.c, version 0.0.0
+// util/map.c, version 0.1.0
 // Map utility source file from libutil
 // Copyright (C) 2021, Jakob Wakeling
 // All rights reserved.
@@ -30,111 +30,159 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 */
 
+/*
+	This file uses the currently non-standard 'typeof' operator. Its use is
+	considered acceptable because it is supported by both GCC and Clang, and
+	POSIX extensions are used here anyway. Additionally, it is expected that the
+	'typeof' operator will become standard in C23.
+*/
+
 #include "alloc.h"
-#include "fnv.h"
 #include "map.h"
 #include "util.h"
 
-#include <malloc.h>
-
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define LOAD_FACTOR 0.75
+#define LOAD_FACTOR 0.90
+
+UINT map_initial_capacity = 64;
 
 static void map_resize(map *m);
+static u64  hash(const char *s, UINT l);
 
 /* Initialise a map. */
-map map_init(UINT el, void (*free)(void *)) {
-	return (map){ NULL, NULL, 0, 0, el, free };
+map map_init(void (*free)(void *)) {
+	return (map){ NULL, 0, 0, free };
 }
 
-/* Unititialise a map. */
+/* Uninitialise a map. */
 void map_free(map *m) {
-	for (map_ent *e = m->n, *n; e; e = n) {
-		n = e->n; if (m->free) { m->free(e->v); } free(e);
+	for (UINT i = 0; i < m->ac; i += 1) {
+		if (m->a[i].h == 0) { continue; } free(m->a[i].k);
+		if (m->free != NULL) { m->free(m->a[i].v); }
 	}
 
-	free(m->a);
+	free(m->a); *m = (map){ NULL, 0, 0, NULL };
 }
 
-/* Insert a pointer into a map. */
-void *map_insert(map *m, const char *k, void *v) {
+#define SWAP(a, b) { register typeof (a) t = a; a = b; b = t; }
+#define DIB(i) ((i + m->ac - (m->a[i].h % m->ac)) % m->ac)
+
+/* Insert a key-value pair into a map. The key is duplicated. */
+void map_insert(map *m, char *k, void *v) {
 	if (m->ac == 0 || m->al >= ((f64)m->ac * LOAD_FACTOR)) { map_resize(m); }
-	UINT index = fnv1a64(k, strlen(k)) % m->ac; void *old = map_remove(m, k);
-	
-	/* Allocate and define the entry */
-	map_ent *e = xcalloc(1, sizeof (*e));
-	// map_debug(m);
-	e->k = (char *)k; e->v = v; e->n = m->n; m->n = e;
+	UINT h = hash(k, strlen(k)), i = h % m->ac; k = strdup(k);
 
-	/* Insert the entry at begining of the buckets chain */
-	// printf("m->a: %p\n", m->a);
-	// printf("%zu\n", malloc_usable_size(m->a + (index * m->el)));
-	memcpy(&e->c, m->a + (index * m->el), m->el);
-	memcpy(m->a + (index * m->el), &e, m->el); m->al += 1;
-	
-	return old;
+	for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) {
+		if (m->a[i].h == 0) {
+			/* If an empty bucket is found, insert here */
+			m->a[i] = (typeof (*m->a)){ h, k, v };
+			m->al += 1; return;
+		}
+		
+		/* Calculate tsid, the DIB of the item at the current index */
+		UINT tsid = (i + m->ac - (m->a[i].h % m->ac)) % m->ac;
+		
+		if (dist > tsid) {
+			SWAP(m->a[i].h, h);
+			SWAP(m->a[i].k, k);
+			SWAP(m->a[i].v, v);
+			
+			dist = tsid;
+		}
+	}
 }
 
-/* Lookup a pointer from a map. */
-void *map_lookup(map *m, const char *k) {
-	UINT index = fnv1a64(k, strlen(k)) % m->ac;
+/* Lookup the value associated with a key from a map. */
+void *map_lookup(map *m, char *k) {
+	UINT h = hash(k, strlen(k)), i = h % m->ac;
 
-	map_ent *e; memcpy(&e, m->a + (index * m->el), m->el);
-	for (; e; e = e->c) { if (!strcmp(e->k, k)) { return e->v; } }
-	
-	return NULL;
+	for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) {
+		if (m->a[i].h == 0) { return NULL; }
+		
+		if (dist > DIB(i)) { return NULL; }
+		if ((m->a[i].h == h) && (strcmp(m->a[i].k, k) == 0)) {
+			return m->a[i].v;
+		}
+	}
 }
 
-/* Remove a pointer from the top of a map. */
-void *map_remove(map *m, const char *k) {
-	return NULL; /* TODO, this needs a 'p' pointer in the entry struct */
+/* Remove a key-value pair from a map. */
+void *map_remove(map *m, char *k) {
+	UINT h = hash(k, strlen(k)), i = h % m->ac;
+	
+	for (UINT dist = 0;; i = (i + 1) % m->ac, dist += 1) {
+		if (m->a[i].h == 0) { return NULL; }
+		
+		if (dist > DIB(i)) { return NULL; }
+		if ((m->a[i].h == h) && (strcmp(m->a[i].k, k) == 0)) {
+			/* If the element to be removed is found, then deallocate it */
+			if (m->free != NULL) { m->free(m->a[i].v); } free(m->a[i].k);
+			m->a[i] = (typeof (*m->a)){ 0, NULL, NULL }; m->al -= 1;
+			
+			/* Shift later entries back until an empty or zero-DIB bucket */
+			for (UINT j = (i + 1) % m->ac;; i = j, j = (j + 1) % m->ac) {
+				if (m->a[j].h == 0 || DIB(j) == 0) { break; }
+				
+				SWAP(m->a[i].h, m->a[j].h);
+				SWAP(m->a[i].k, m->a[j].k);
+				SWAP(m->a[i].v, m->a[j].v);
+			}
+			
+			/*
+				TODO I am unsure if I want to have this procedure return the
+				removed value or simply an acknowledgement of its removal
+			*/
+			return (void *)true;
+		}
+	}
 }
 
-/* Print a basic representation of the map to stdout. */
+/* Print a basic representation of a map to stdout. */
 void map_print(map *m) {
-	for (map_ent *e = m->n; e; e = e->n) {
-		printf("%s -> %s\n", e->k, (char *)e->v);
+	for (UINT i = 0; i < m->ac; i += 1) if (m->a[i].h != 0) {
+		printf("%s -> %s\n", m->a[i].k, (char *)m->a[i].v);
 	}
 }
 
-/* Print a debug representation of the map to stdout. */
-/* FIXME for some reason printf leaves junk in my calloc (m->a[33]) */
+/* Print a debug representation of a map to stdout. */
 void map_debug(map *m) {
-	for (UINT i = 0; i != m->ac; i += 1) {
-		map_ent *e; memcpy(&e, m->a + (i * m->el), m->el);
-		
-		// printf("%zu: ", i);
-		for (; e; e = e->c) { printf("%s -> %s, ", e->k, (char *)e->v); }
-		// printf("\n");
-	} printf("\n");
+	for (UINT i = 0; i < m->ac; i += 1) {
+		if (m->a[i].h == 0) { printf("[%zu] %lu\n", i, m->a[i].h); }
+		else printf(
+			"[%zu] %lu, %s -> %s, DIB: %zu\n",
+			i, m->a[i].h, m->a[i].k, (char *)m->a[i].v, DIB(i)
+		);
+	}
 }
 
 /* Double the number of buckets in a map. */
 static void map_resize(map *m) {
-	if (m->ac == 0) { m->ac = 256; } else { m->ac *= 2; }
-	
-	/* If the map is empty, simply resize it without rehashing */
-	if (m->al == 0) {
-		// printf("%zu * %zu = %zu; %zu\n", m->ac, m->el, m->ac * m->el, m->ac * 8 * m->el); fflush(stdout);
-		free(m->a); m->a = xcalloc(m->ac * 8, m->el);
-		// printf("m->a: %p\n", m->a);
-		// printf("%p: %zu\n", m->a, malloc_usable_size(m->a));
-		// printf("%p: %zu\n", m->a + 1696, malloc_usable_size(m->a + 1696));
-		// printf("%zu\n", ((m->a + 1696) - m->a));
-		fflush(stdout);
+	/* If the map is empty, simply allocate it without rehashing */
+	if (m->ac == 0) {
+		m->ac = map_initial_capacity;
+		m->a = xcalloc(m->ac, sizeof (*m->a)); return;
 	}
+	
 	/* Otherwise rehash every element into a new resized map */
-	else {
-		// printf("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n"); fflush(stdout);
-		
-		map old = *m;
-		
-		m->a = xcalloc(m->ac * 8, m->el); m->n = NULL; m->al = 0;
-		for (map_ent *e = old.n; e; e = e->n) { map_insert(m, e->k, e->v); }
+	map old = *m; m->ac *= 2; m->a = xcalloc(m->ac, sizeof (*m->a)); m->al = 0;
+	
+	for (UINT i = 0; i < old.ac; i += 1) {
+		if (old.a[i].h == 0) { continue; }
 
-		map_free(&old);
+		map_insert(m, old.a[i].k, old.a[i].v);
+		free(old.a[i].k);
 	}
+	
+	free(old.a);
+}
+
+/* Compute the hash of some data. Will not return 0. */
+static u64 hash(const char *dat, UINT len) {
+	register u64 fnv = 0xCBF29CE484222325;
+	for (; len; len -= 1, dat += 1) { fnv ^= *dat; fnv *= 0x00000100000001B3; }
+	fnv |= fnv == 0; return fnv;
 }
diff --git a/src/map.h b/src/map.h
index 59c37d5..652aea4 100644
--- a/src/map.h
+++ b/src/map.h
@@ -1,4 +1,4 @@
-// util/map.h, version 0.0.0
+// util/map.h, version 0.1.0
 // Map utility header file from libutil
 // Copyright (C) 2021, Jakob Wakeling
 // All rights reserved.
@@ -35,17 +35,23 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 
 #include "util.h"
 
-#define MAP_INSERT(m, k, v) map_insert(m, k, (void *)(UINT)(e))
+typedef struct {
+	struct { u64 h; char *k; void *v; } *a;
+	UINT al, ac; void (*free)(void *);
+} map;
 
-typedef struct map_ent { char *k; void *v; struct map_ent *c, *p, *n; } map_ent;
-typedef struct { map_ent **a, *n; UINT al, ac, el; void (*free)(void *); } map;
+// #define MAP_INIT(type, free) map_init(sizeof (type), (free))
+// #define MAP_INSERT(m, k, v) map_insert(m, k, (void *)(UINT)(e))
 
-extern map  map_init(UINT el, void (*free)(void *));
+/* This should be used for debugging or testing only. */
+extern UINT map_initial_capacity;
+
+extern map  map_init(void (*free)(void *));
 extern void map_free(map *m);
 
-extern void *map_insert(map *m, const char *k, void *v);
-extern void *map_lookup(map *m, const char *k);
-extern void *map_remove(map *m, const char *k);
+extern void  map_insert(map *m, char *k, void *v);
+extern void *map_lookup(map *m, char *k);
+extern void *map_remove(map *m, char *k);
 
 extern void map_print(map *m);
 extern void map_debug(map *m);
diff --git a/src/test/test_map.c b/src/test/test_map.c
index adbbb45..9a74c77 100644
--- a/src/test/test_map.c
+++ b/src/test/test_map.c
@@ -26,19 +26,73 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 
 int main(void) {
 	{
-		map m = map_init(sizeof (void *), NULL);
+		map_initial_capacity = 4;
+		map m = map_init(NULL);
 
 		map_insert(&m, "key0", "value0");
-		// map_insert(&m, "key1", "value1");
+		map_insert(&m, "key4", "value4");
+		map_insert(&m, "key3", "value3");
 
-		// ASSERT("T000 INSERT AND LOOKUP STRING 0",
-		// 		strcmp(map_lookup(&m, "key0"), "value0") == 0);
-		// ASSERT("T001 INSERT AND LOOKUP STRING 1",
-		// 		strcmp(map_lookup(&m, "key1"), "value1") == 0);
+		ASSERT("T000 INSERT AND LOOKUP STRING 0",
+				strcmp(map_lookup(&m, "key0"), "value0") == 0);
+		ASSERT("T001 INSERT AND LOOKUP STRING 1",
+				strcmp(map_lookup(&m, "key4"), "value4") == 0);
+		ASSERT("T002 INSERT AND LOOKUP STRING 2",
+				strcmp(map_lookup(&m, "key3"), "value3") == 0);
+		
+		ASSERT("T003 LOOKUP NONEXISTENT STRING",
+				map_lookup(&m, "key1") == NULL);
+		
+		map_remove(&m, "key4");
+		
+		ASSERT("T004 REMOVE AND LOOKUP STRING",
+				map_lookup(&m, "key4") == NULL);
+		
+		ASSERT("T006 LOOKUP STRING AFTER REMOVE 0",
+				strcmp(map_lookup(&m, "key0"), "value0") == 0);
+		ASSERT("T007 LOOKUP STRING AFTER REMOVE 1",
+				strcmp(map_lookup(&m, "key3"), "value3") == 0);
+		
+		ASSERT("T008 REMOVE NONEXISTENT STRING",
+				map_remove(&m, "key4") == NULL);
+		
+		map_free(&m);
+	}
+	
+	{
+		map_initial_capacity = 4;
+		map m = map_init(NULL);
+		
+		map_insert(&m, "key0", "value0");
+		map_insert(&m, "key3", "value3");
+		map_insert(&m, "key4", "value4");
+		
+		ASSERT("T100 INSERT AND LOOKUP STRING 0",
+				strcmp(map_lookup(&m, "key0"), "value0") == 0);
+		ASSERT("T101 INSERT AND LOOKUP STRING 1",
+				strcmp(map_lookup(&m, "key3"), "value3") == 0);
+		ASSERT("T102 INSERT AND LOOKUP STRING 2",
+				strcmp(map_lookup(&m, "key4"), "value4") == 0);
+		
+		ASSERT("T103 LOOKUP NONEXISTENT STRING",
+				map_lookup(&m, "key1") == NULL);
+		
+		map_remove(&m, "key4");
+		
+		ASSERT("T104 REMOVE AND LOOKUP STRING",
+				map_lookup(&m, "key4") == NULL);
+		
+		ASSERT("T106 LOOKUP STRING AFTER REMOVE 0",
+				strcmp(map_lookup(&m, "key0"), "value0") == 0);
+		ASSERT("T107 LOOKUP STRING AFTER REMOVE 1",
+				strcmp(map_lookup(&m, "key3"), "value3") == 0);
+		
+		ASSERT("T108 REMOVE NONEXISTENT STRING",
+				map_remove(&m, "key4") == NULL);
 
 		map_free(&m);
 	}
 
-	// printf("%d of %d tests passed\n", testspassed, testsrun);
+	printf("%d of %d tests passed\n", testspassed, testsrun);
 	return testsfailed;
 }