solidc
A robust collection of general-purpose, cross-platform C libraries and data structures designed for rapid and safe development.
Loading...
Searching...
No Matches
hashset.h
Go to the documentation of this file.
1
25#ifndef __HASHSET_H__
26#define __HASHSET_H__
27
28#include <stdbool.h>
29#include <stdint.h>
30#include <stdlib.h>
31#include <string.h>
32
33#if defined(__cplusplus)
34extern "C" {
35#endif
36
37/* =========================================================================
38 * Constants
39 * ========================================================================= */
40
/* Starting slot count when the caller requests fewer (always a power of two). */
#define HASHSET_DEFAULT_CAPACITY 16u
/* Grow when (size + 1) / capacity would exceed this ratio. */
#define HASHSET_LOAD_FACTOR 0.75

/* Slot metadata tags stored in the `meta` byte array. */
#define _HS_EMPTY 0x00u /* slot never used */
#define _HS_DELETED 0x01u /* slot vacated (tombstone) */
/* Values 0x02–0xFF: fingerprint taken from hash bits 7..14; OR-ing 0x02
 * forces bit 1 so a fingerprint can never collide with _HS_EMPTY or
 * _HS_DELETED.
 * NOTE(review): `_HS_*` (underscore + uppercase) and `__HASHSET_H__` fall in
 * the identifier space reserved for the implementation (C11 7.1.3) —
 * consider renaming in a future major version. */
#define _HS_FINGERPRINT(h) (uint8_t)(((h) >> 7) | 0x02u)
49
50/* =========================================================================
51 * Types
52 * ========================================================================= */
53
/*
 * Open-addressing hash set with linear probing.
 *
 * Two parallel arrays: `meta[i]` tags slot i (_HS_EMPTY, _HS_DELETED, or a
 * fingerprint >= 0x02) and the key for slot i lives at keys[i * key_size].
 * Capacity is kept a power of two so probing can wrap with a bit-mask
 * instead of a modulo.
 */
typedef struct {
    uint8_t* meta; /* metadata byte per slot */
    char* keys; /* flat key storage: slot i at keys[i*key_size] */
    size_t capacity; /* number of slots (always power-of-two) */
    size_t size; /* live elements */
    size_t key_size; /* bytes per key */

    /* Hashes a key's raw bytes; hashset_create substitutes FNV-1a when NULL. */
    uint64_t (*hash_fn)(const void* key, size_t key_size);

    /* Key equality; hashset_create substitutes a memcmp comparison when NULL. */
    bool (*equals_fn)(const void* a, const void* b, size_t key_size);
} hashset_t;
66
67/* =========================================================================
68 * Default hash (FNV-1a 64-bit)
69 * ========================================================================= */
70
/* Default key hash: FNV-1a over the key's raw bytes (64-bit variant). */
static inline uint64_t hashset_default_hash(const void* key, size_t key_size) {
    uint64_t h = 14695981039346656037ULL; /* FNV offset basis */
    const unsigned char* p = (const unsigned char*)key;
    const unsigned char* end = p + key_size;
    while (p < end) {
        h = (h ^ *p++) * 1099511628211ULL; /* xor byte, then FNV prime */
    }
    return h;
}
82
/* Default key comparator: byte-wise equality over key_size bytes. */
static inline bool hashset_default_equals(const void* a, const void* b, size_t ks) {
    return memcmp(a, b, ks) == 0;
}
84
85/* Return slot index for probe step i starting from base. */
/* Slot index reached after `i` probe steps from `base`, wrapped by `mask`
 * (mask == capacity - 1, capacity a power of two). */
static inline size_t _hs_slot(size_t base, size_t i, size_t mask) {
    return (base + i) & mask;
}
87
88/* Pointer to key stored in slot idx. */
89static inline void* _hs_key(const hashset_t* s, size_t idx) { return (void*)(s->keys + idx * s->key_size); }
90
91static inline hashset_t* hashset_create(size_t key_size, size_t initial_capacity,
92 uint64_t (*hash_fn)(const void*, size_t),
93 bool (*equals_fn)(const void*, const void*, size_t)) {
94 if (key_size == 0) return NULL;
95
96 /* Round up to next power-of-two. */
97 size_t cap = HASHSET_DEFAULT_CAPACITY;
98 while (cap < initial_capacity) cap <<= 1;
99
100 hashset_t* set = (hashset_t*)malloc(sizeof(hashset_t));
101 if (!set) return NULL;
102
103 set->meta = (uint8_t*)calloc(cap, sizeof(uint8_t)); /* all EMPTY */
104 set->keys = (char*)malloc(cap * key_size);
105
106 if (!set->meta || !set->keys) {
107 free(set->meta);
108 free(set->keys);
109 free(set);
110 return NULL;
111 }
112
113 set->capacity = cap;
114 set->size = 0;
115 set->key_size = key_size;
116 set->hash_fn = hash_fn ? hash_fn : hashset_default_hash;
117 set->equals_fn = equals_fn ? equals_fn : hashset_default_equals;
118 return set;
119}
120
121static inline void hashset_destroy(hashset_t* set) {
122 if (!set) return;
123 free(set->meta);
124 free(set->keys);
125 free(set);
126}
127
128/* =========================================================================
129 * Lookup (contains)
130 *
131 * Probe linearly, comparing the 7-bit fingerprint first (no memcmp unless
132 * the fingerprint matches — typical fast-rejection path touches only 1 byte).
133 * ========================================================================= */
134
135static inline bool hashset_contains(const hashset_t* set, const void* key) {
136 if (!set || !key) return false;
137
138 const uint64_t hash = set->hash_fn(key, set->key_size);
139 const size_t mask = set->capacity - 1;
140 const uint8_t fp = _HS_FINGERPRINT(hash);
141 size_t idx = (size_t)(hash & mask);
142
143 for (size_t i = 0; i < set->capacity; i++) {
144 const uint8_t m = set->meta[idx];
145
146 if (m == _HS_EMPTY) return false; /* cluster ends, key not present */
147
148 /* Skip DELETED slots — back-shift avoids them but rehash edge cases
149 * can leave one; probing must continue past them. */
150 if (m != _HS_DELETED && m == fp && set->equals_fn(_hs_key(set, idx), key, set->key_size)) return true;
151
152 idx = _hs_slot(idx, 1, mask);
153 }
154 return false;
155}
156
157/* =========================================================================
158 * Rehash (internal)
159 * ========================================================================= */
160
161static inline bool _hs_rehash(hashset_t* set, size_t new_cap) {
162 uint8_t* new_meta = (uint8_t*)calloc(new_cap, sizeof(uint8_t));
163 char* new_keys = (char*)malloc(new_cap * set->key_size);
164 if (!new_meta || !new_keys) {
165 free(new_meta);
166 free(new_keys);
167 return false;
168 }
169
170 const size_t mask = new_cap - 1;
171
172 for (size_t i = 0; i < set->capacity; i++) {
173 if (set->meta[i] < 0x02u) continue; /* EMPTY or DELETED */
174
175 const void* k = _hs_key(set, i);
176 const uint64_t h = set->hash_fn(k, set->key_size);
177 const uint8_t fp = _HS_FINGERPRINT(h);
178 size_t idx = (size_t)(h & mask);
179
180 /* Linear probe in new table (no deletions yet, so no DELETED slots). */
181 while (new_meta[idx] != _HS_EMPTY) idx = (idx + 1) & mask;
182
183 new_meta[idx] = fp;
184 memcpy(new_keys + idx * set->key_size, k, set->key_size);
185 }
186
187 free(set->meta);
188 free(set->keys);
189 set->meta = new_meta;
190 set->keys = new_keys;
191 set->capacity = new_cap;
192 return true;
193}
194
195/* =========================================================================
196 * Insert
197 * ========================================================================= */
198
199static inline bool hashset_add(hashset_t* set, const void* key) {
200 if (!set || !key) return false;
201
202 /* Grow before inserting if load would exceed 75 %. */
203 if ((double)(set->size + 1) / (double)set->capacity > HASHSET_LOAD_FACTOR) {
204 if (!_hs_rehash(set, set->capacity * 2)) return false;
205 }
206
207 const uint64_t hash = set->hash_fn(key, set->key_size);
208 const size_t mask = set->capacity - 1;
209 const uint8_t fp = _HS_FINGERPRINT(hash);
210 size_t idx = (size_t)(hash & mask);
211 size_t first_del = SIZE_MAX; /* first DELETED slot seen */
212
213 for (size_t i = 0; i < set->capacity; i++) {
214 const uint8_t m = set->meta[idx];
215
216 if (m == _HS_EMPTY) {
217 /* Use earlier DELETED slot if one was found. */
218 size_t ins = (first_del != SIZE_MAX) ? first_del : idx;
219 set->meta[ins] = fp;
220 memcpy(_hs_key(set, ins), key, set->key_size);
221 set->size++;
222 return true;
223 }
224
225 if (m == _HS_DELETED) {
226 if (first_del == SIZE_MAX) first_del = idx;
227 } else if (m == fp && set->equals_fn(_hs_key(set, idx), key, set->key_size)) {
228 return true; /* already present */
229 }
230
231 idx = _hs_slot(idx, 1, mask);
232 }
233
234 /* Table full (shouldn't happen at 75 % load) — insert into first_del. */
235 if (first_del != SIZE_MAX) {
236 set->meta[first_del] = fp;
237 memcpy(_hs_key(set, first_del), key, set->key_size);
238 set->size++;
239 return true;
240 }
241 return false;
242}
243
244/* =========================================================================
245 * Remove — backward-shift deletion (no tombstone accumulation)
246 *
247 * After evicting a slot, walk forward and pull back any element whose
248 * "natural" position is at or before the vacated slot, so future probes
249 * never need to leap over a DELETED sentinel.
250 * ========================================================================= */
251
/**
 * Remove `key` from the set.
 *
 * Uses backward-shift deletion: instead of leaving a tombstone, elements
 * later in the probe cluster are pulled back toward the vacated slot so the
 * invariant "no EMPTY slot between an element and its natural position"
 * keeps holding.
 *
 * @return true if the key was found and removed, false otherwise
 *         (including for NULL arguments).
 */
static inline bool hashset_remove(hashset_t* set, const void* key) {
    if (!set || !key) return false;

    const uint64_t hash = set->hash_fn(key, set->key_size);
    const size_t mask = set->capacity - 1;
    const uint8_t fp = _HS_FINGERPRINT(hash);
    size_t idx = (size_t)(hash & mask);

    /* Locate the element, skipping DELETED slots. */
    size_t pos = SIZE_MAX;
    for (size_t i = 0; i < set->capacity; i++) {
        const uint8_t m = set->meta[idx];
        if (m == _HS_EMPTY) return false; /* cluster ends: key not present */
        if (m != _HS_DELETED && m == fp && set->equals_fn(_hs_key(set, idx), key, set->key_size)) {
            pos = idx;
            break;
        }
        idx = _hs_slot(idx, 1, mask);
    }
    if (pos == SIZE_MAX) return false;

    /*
     * Backward-shift deletion — no tombstones.
     *
     * Two cursors:
     *   hole = the empty slot to be filled; starts at pos, advances only
     *          when an element is shifted into it.
     *   scan = lookahead cursor that advances unconditionally every step.
     *
     * For each slot at `scan`:
     *   • EMPTY → cluster ends, stop.
     *   • d_hole < d_scan → element is displaced further than hole is from
     *     its natural slot; moving it left to hole brings it closer. Shift
     *     it and advance hole to scan.
     *   • otherwise → leave element in place; hole stays, scan continues.
     *
     * CRITICAL: hole and scan must be separate variables. If next were
     * always computed as (hole+1), skipping an element would cause the same
     * slot to be re-examined every iteration. The dual-cursor ensures scan
     * always makes forward progress regardless of whether we shifted.
     *
     * We must NOT break on d_scan==0 (element in natural slot). Consider:
     *   [i]   nat=i   dist=0 <- being removed
     *   [i+1] nat=i+1 dist=0 <- in natural slot; d_hole>d_scan -> no shift
     *   [i+2] nat=i   dist=2 <- still needs pulling to i; would be orphaned
     *                           if we broke at i+1.
     */
    size_t hole = pos;
    size_t scan = (pos + 1) & mask;

    /* At most capacity-1 other slots can belong to the cluster. */
    for (size_t i = 0; i < set->capacity - 1; i++, scan = (scan + 1) & mask) {
        if (set->meta[scan] < 0x02u) break; /* EMPTY or DELETED: cluster ends */

        /* Distance of the scanned element from its natural slot (d_scan)
         * versus the distance the hole sits from that same natural slot
         * (d_hole); both measured forward modulo capacity. */
        uint64_t sh = set->hash_fn(_hs_key(set, scan), set->key_size);
        size_t s_nat = (size_t)(sh & mask);
        size_t d_scan = (scan - s_nat) & mask;
        size_t d_hole = (hole - s_nat) & mask;

        if (d_hole < d_scan) {
            set->meta[hole] = set->meta[scan];
            memcpy(_hs_key(set, hole), _hs_key(set, scan), set->key_size);
            hole = scan;
        }
        /* else: skip — hole stays, scan advances via loop increment */
    }

    set->meta[hole] = _HS_EMPTY;
    set->size--;
    return true;
}
322
323/* =========================================================================
324 * Accessors
325 * ========================================================================= */
326
327static inline size_t hashset_size(const hashset_t* s) { return s ? s->size : 0; }
328static inline size_t hashset_capacity(const hashset_t* s) { return s ? s->capacity : 0; }
329static inline bool hashset_isempty(const hashset_t* s) { return !s || s->size == 0; }
330
331static inline void hashset_clear(hashset_t* set) {
332 if (!set) return;
333 memset(set->meta, 0, set->capacity);
334 set->size = 0;
335}
336
337/* =========================================================================
338 * Set operations (unchanged API, reuse primitive operations)
339 * ========================================================================= */
340
341static inline hashset_t* hashset_union(const hashset_t* A, const hashset_t* B) {
342 if (!A || !B || A->key_size != B->key_size) return NULL;
343 hashset_t* r = hashset_create(A->key_size, A->size + B->size, A->hash_fn, A->equals_fn);
344 if (!r) return NULL;
345 for (size_t i = 0; i < A->capacity; i++)
346 if (A->meta[i] >= 0x02u) hashset_add(r, _hs_key(A, i));
347 for (size_t i = 0; i < B->capacity; i++)
348 if (B->meta[i] >= 0x02u) hashset_add(r, _hs_key(B, i));
349 return r;
350}
351
352static inline hashset_t* hashset_intersection(const hashset_t* A, const hashset_t* B) {
353 if (!A || !B || A->key_size != B->key_size) return NULL;
354 const hashset_t* small = A->size <= B->size ? A : B;
355 const hashset_t* large = A->size <= B->size ? B : A;
356 hashset_t* r = hashset_create(A->key_size, small->size, A->hash_fn, A->equals_fn);
357 if (!r) return NULL;
358 for (size_t i = 0; i < small->capacity; i++)
359 if (small->meta[i] >= 0x02u && hashset_contains(large, _hs_key(small, i))) hashset_add(r, _hs_key(small, i));
360 return r;
361}
362
363static inline hashset_t* hashset_difference(const hashset_t* A, const hashset_t* B) {
364 if (!A || !B || A->key_size != B->key_size) return NULL;
365 hashset_t* r = hashset_create(A->key_size, A->size, A->hash_fn, A->equals_fn);
366 if (!r) return NULL;
367 for (size_t i = 0; i < A->capacity; i++)
368 if (A->meta[i] >= 0x02u && !hashset_contains(B, _hs_key(A, i))) hashset_add(r, _hs_key(A, i));
369 return r;
370}
371
372static inline hashset_t* hashset_symmetric_difference(const hashset_t* A, const hashset_t* B) {
373 if (!A || !B || A->key_size != B->key_size) return NULL;
374 hashset_t* r = hashset_create(A->key_size, A->size + B->size, A->hash_fn, A->equals_fn);
375 if (!r) return NULL;
376 for (size_t i = 0; i < A->capacity; i++)
377 if (A->meta[i] >= 0x02u && !hashset_contains(B, _hs_key(A, i))) hashset_add(r, _hs_key(A, i));
378 for (size_t i = 0; i < B->capacity; i++)
379 if (B->meta[i] >= 0x02u && !hashset_contains(A, _hs_key(B, i))) hashset_add(r, _hs_key(B, i));
380 return r;
381}
382
383static inline bool hashset_is_subset(const hashset_t* A, const hashset_t* B) {
384 if (!A || !B || A->key_size != B->key_size) return false;
385 if (A->size > B->size) return false;
386 for (size_t i = 0; i < A->capacity; i++)
387 if (A->meta[i] >= 0x02u && !hashset_contains(B, _hs_key(A, i))) return false;
388 return true;
389}
390
391static inline bool hashset_is_proper_subset(const hashset_t* A, const hashset_t* B) {
392 if (!A || !B) return false;
393 return A->size < B->size && hashset_is_subset(A, B);
394}
395
396#if defined(__cplusplus)
397}
398#endif
399
400#endif /* __HASHSET_H__ */