commit cbc287dcbb29ad321dca5cd14c31998279205243 from: Stefan Sperling date: Tue Apr 19 20:08:41 2022 UTC reimplement object-ID set data structure on top of a hash table Siphash suggested by jrick as a better alternative to murmurhash for this use case. with small fixes from and ok op@ commit - 3313bcd840cd85a1b349abf664992608a42bbbde commit + cbc287dcbb29ad321dca5cd14c31998279205243 blob - 5346f7f2ce1559b966b7c6e2f1bde6da40d729a4 blob + 750be4962021434fb27d2dcc95867ff2e333a776 --- lib/got_lib_object_idset.h +++ lib/got_lib_object_idset.h @@ -31,13 +31,3 @@ const struct got_error *got_object_idset_for_each(stru const struct got_error *(*cb)(struct got_object_id *, void *, void *), void *); int got_object_idset_num_elements(struct got_object_idset *); - -struct got_object_idset_element; -struct got_object_idset_element *got_object_idset_get_element( - struct got_object_idset *, struct got_object_id *); -void *got_object_idset_get_element_data(struct got_object_idset_element *); -const struct got_error *got_object_idset_for_each_element(struct got_object_idset *, - const struct got_error *(*cb)(struct got_object_idset_element *, void *), void *); -void got_object_idset_remove_element(struct got_object_idset *, - struct got_object_idset_element *); - blob - 06f7347e22beeabe835c116ac99d59e61f87ea68 blob + e27d30ce0887da3caae5375fdd367eb524ddac3e --- lib/object_idset.c +++ lib/object_idset.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2019 Stefan Sperling + * Copyright (c) 2022 Stefan Sperling * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -15,15 +15,17 @@ */ #include -#include #include +#include #include #include #include #include #include #include +#include +#include #include "got_object.h" #include "got_error.h" @@ -32,116 +34,185 @@ #include "got_lib_inflate.h" #include "got_lib_object.h" #include "got_lib_object_idset.h" +#include "got_lib_object_parse.h" -struct got_object_idset_element { - RB_ENTRY(got_object_idset_element) entry; - struct got_object_id id; - void *data; /* API user data */ -}; +#define GOT_OBJECT_IDSET_MIN_BUCKETS 64 -RB_HEAD(got_object_idset_tree, got_object_idset_element); - -static int -cmp_elements(const struct got_object_idset_element *e1, - const struct got_object_idset_element *e2) -{ - return got_object_id_cmp(&e1->id, &e2->id); -} - -RB_PROTOTYPE(got_object_idset_tree, got_object_idset_element, entry, - cmp_elements); - struct got_object_idset { - struct got_object_idset_tree entries; - int totelem; -#define GOT_OBJECT_IDSET_MAX_ELEM INT_MAX + struct got_object_id_queue *ids; + size_t nbuckets; + unsigned int totelem; + unsigned int flags; +#define GOT_OBJECT_IDSET_F_TRAVERSAL 0x01 +#define GOT_OBJECT_IDSET_F_NOMEM 0x02 + SIPHASH_KEY key; }; struct got_object_idset * got_object_idset_alloc(void) { struct got_object_idset *set; + int i; set = malloc(sizeof(*set)); if (set == NULL) return NULL; - RB_INIT(&set->entries); - set->totelem = 0; + set->ids = calloc(sizeof(set->ids[0]), GOT_OBJECT_IDSET_MIN_BUCKETS); + if (set->ids == NULL) { + free(set); + return NULL; + } + for (i = 0; i < GOT_OBJECT_IDSET_MIN_BUCKETS; i++) + STAILQ_INIT(&set->ids[i]); + set->totelem = 0; + set->nbuckets = GOT_OBJECT_IDSET_MIN_BUCKETS; + set->flags = 0; + arc4random_buf(&set->key, sizeof(set->key)); return set; } void got_object_idset_free(struct got_object_idset *set) { - struct got_object_idset_element *entry; + size_t i; + struct got_object_qid *qid; - while ((entry = RB_MIN(got_object_idset_tree, &set->entries))) { - RB_REMOVE(got_object_idset_tree, &set->entries, entry); - /* User data should be freed by caller. */ - free(entry); + for (i = 0; i < set->nbuckets; i++) { + while (!STAILQ_EMPTY(&set->ids[i])) { + qid = STAILQ_FIRST(&set->ids[i]); + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); + got_object_qid_free(qid); + } } - + /* User data should be freed by caller. */ + free(set->ids); free(set); } -const struct got_error * -got_object_idset_add(struct got_object_idset *set, struct got_object_id *id, - void *data) +static uint64_t +idset_hash(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *new; + return SipHash24(&set->key, id->sha1, sizeof(id->sha1)); +} - if (set->totelem >= GOT_OBJECT_IDSET_MAX_ELEM) - return got_error(GOT_ERR_NO_SPACE); +static const struct got_error * +idset_resize(struct got_object_idset *set, size_t nbuckets) +{ + struct got_object_id_queue *ids; + size_t i; - new = malloc(sizeof(*new)); - if (new == NULL) - return got_error_from_errno("malloc"); + ids = calloc(nbuckets, sizeof(ids[0])); + if (ids == NULL) { + if (errno != ENOMEM) + return got_error_from_errno("calloc"); + /* Proceed with our current amount of hash buckets. */ + set->flags |= GOT_OBJECT_IDSET_F_NOMEM; + return NULL; + } - memcpy(&new->id, id, sizeof(new->id)); - new->data = data; + for (i = 0; i < nbuckets; i++) + STAILQ_INIT(&ids[i]); - if (RB_INSERT(got_object_idset_tree, &set->entries, new) != NULL) { - free(new); - return got_error(GOT_ERR_OBJ_EXISTS); + arc4random_buf(&set->key, sizeof(set->key)); + + for (i = 0; i < set->nbuckets; i++) { + while (!STAILQ_EMPTY(&set->ids[i])) { + struct got_object_qid *qid; + uint64_t idx; + qid = STAILQ_FIRST(&set->ids[i]); + STAILQ_REMOVE(&set->ids[i], qid, got_object_qid, entry); + idx = idset_hash(set, qid->id) % nbuckets; + STAILQ_INSERT_HEAD(&ids[idx], qid, entry); + } } - set->totelem++; + free(set->ids); + set->ids = ids; + set->nbuckets = nbuckets; return NULL; } -static struct got_object_idset_element * -find_element(struct got_object_idset *set, struct got_object_id *id) +static const struct got_error * +idset_grow(struct got_object_idset *set) { - struct got_object_idset_element *entry; + size_t nbuckets; - entry = RB_ROOT(&set->entries); - while (entry) { - int cmp = got_object_id_cmp(id, &entry->id); - if (cmp < 0) - entry = RB_LEFT(entry, entry); - else if (cmp > 0) - entry = RB_RIGHT(entry, entry); - else - break; + if (set->flags & GOT_OBJECT_IDSET_F_NOMEM) + return NULL; + + if (set->nbuckets >= UINT_MAX / 2) + nbuckets = UINT_MAX; + else + nbuckets = set->nbuckets * 2; + + return idset_resize(set, nbuckets); +} + +const struct got_error * +got_object_idset_add(struct got_object_idset *set, struct got_object_id *id, + void *data) +{ + const struct got_error *err; + struct got_object_qid *qid; + uint64_t idx; + struct got_object_id_queue *head; + + /* This function may resize the set. */ + if (set->flags & GOT_OBJECT_IDSET_F_TRAVERSAL) + return got_error_msg(GOT_ERR_NOT_IMPL, + "cannot add elements to idset during traversal"); + + if (set->totelem == UINT_MAX) + return got_error(GOT_ERR_NO_SPACE); + + err = got_object_qid_alloc_partial(&qid); + if (err) + return err; + memcpy(qid->id, id, sizeof(*qid->id)); + qid->data = data; + + idx = idset_hash(set, id) % set->nbuckets; + head = &set->ids[idx]; + STAILQ_INSERT_HEAD(head, qid, entry); + set->totelem++; + + if (set->nbuckets < set->totelem) + err = idset_grow(set); + + return err; +} + +static struct got_object_qid * +find_element(struct got_object_idset *set, struct got_object_id *id) +{ + uint64_t idx = idset_hash(set, id) % set->nbuckets; + struct got_object_id_queue *head = &set->ids[idx]; + struct got_object_qid *qid; + + STAILQ_FOREACH(qid, head, entry) { + if (got_object_id_cmp(qid->id, id) == 0) + return qid; } - return entry; + return NULL; } void * got_object_idset_get(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry = find_element(set, id); - return entry ? entry->data : NULL; + struct got_object_qid *qid = find_element(set, id); + return qid ? qid->data : NULL; } const struct got_error * got_object_idset_remove(void **data, struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry; + uint64_t idx; + struct got_object_id_queue *head; + struct got_object_qid *qid; if (data) *data = NULL; @@ -150,20 +221,30 @@ got_object_idset_remove(void **data, struct got_object return got_error(GOT_ERR_NO_OBJ); if (id == NULL) { - entry = RB_ROOT(&set->entries); - if (entry == NULL) - return got_error(GOT_ERR_NO_OBJ); + /* Remove a "random" element. */ + for (idx = 0; idx < set->nbuckets; idx++) { + head = &set->ids[idx]; + qid = STAILQ_FIRST(head); + if (qid) + break; + } } else { - entry = find_element(set, id); - if (entry == NULL) + idx = idset_hash(set, id) % set->nbuckets; + head = &set->ids[idx]; + STAILQ_FOREACH(qid, head, entry) { + if (got_object_id_cmp(qid->id, id) == 0) + break; + } + if (qid == NULL) return got_error_no_obj(id); } - RB_REMOVE(got_object_idset_tree, &set->entries, entry); if (data) - *data = entry->data; - free(entry); + *data = qid->data; + STAILQ_REMOVE(head, qid, got_object_qid, entry); + got_object_qid_free(qid); set->totelem--; + return NULL; } @@ -171,8 +252,8 @@ int got_object_idset_contains(struct got_object_idset *set, struct got_object_id *id) { - struct got_object_idset_element *entry = find_element(set, id); - return entry ? 1 : 0; + struct got_object_qid *qid = find_element(set, id); + return qid ? 1 : 0; } const struct got_error * @@ -180,15 +261,23 @@ got_object_idset_for_each(struct got_object_idset *set const struct got_error *(*cb)(struct got_object_id *, void *, void *), void *arg) { - const struct got_error *err; - struct got_object_idset_element *entry, *tmp; + const struct got_error *err = NULL; + struct got_object_id_queue *head; + struct got_object_qid *qid, *tmp; + size_t i; - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { - err = (*cb)(&entry->id, entry->data, arg); - if (err) - return err; + set->flags |= GOT_OBJECT_IDSET_F_TRAVERSAL; + for (i = 0; i < set->nbuckets; i++) { + head = &set->ids[i]; + STAILQ_FOREACH_SAFE(qid, head, entry, tmp) { + err = (*cb)(qid->id, qid->data, arg); + if (err) + goto done; + } } - return NULL; +done: + set->flags &= ~GOT_OBJECT_IDSET_F_TRAVERSAL; + return err; } int @@ -196,43 +285,3 @@ got_object_idset_num_elements(struct got_object_idset { return set->totelem; } - -struct got_object_idset_element * -got_object_idset_get_element(struct got_object_idset *set, struct got_object_id *id) -{ - return find_element(set, id); -} - -void * -got_object_idset_get_element_data(struct got_object_idset_element *entry) -{ - return entry->data; -} - -const struct got_error * -got_object_idset_for_each_element(struct got_object_idset *set, - const struct got_error *(*cb)(struct got_object_idset_element *, void *), - void *arg) -{ - const struct got_error *err; - struct got_object_idset_element *entry, *tmp; - - RB_FOREACH_SAFE(entry, got_object_idset_tree, &set->entries, tmp) { - err = (*cb)(entry, arg); - if (err) - return err; - } - return NULL; -} - -void -got_object_idset_remove_element(struct got_object_idset *set, - struct got_object_idset_element *entry) -{ - RB_REMOVE(got_object_idset_tree, &set->entries, entry); - free(entry); - set->totelem--; -} - -RB_GENERATE(got_object_idset_tree, got_object_idset_element, entry, - cmp_elements); blob - fa7ea77d02fc1d6284318e12d67fc4c0742fde70 blob + 81518d57e3cdb40c14904ec873d086182fea8b4c --- lib/pack_create.c +++ lib/pack_create.c @@ -1777,25 +1777,24 @@ done: } static const struct got_error * -remove_unused_object(struct got_object_idset_element *entry, void *arg) +remove_unused_object(struct got_object_id *id, void *data, void *arg) { struct got_object_idset *idset = arg; - if (got_object_idset_get_element_data(entry) == NULL) - got_object_idset_remove_element(idset, entry); + if (data == NULL) + got_object_idset_remove(NULL, idset, id); return NULL; } static const struct got_error * -remove_reused_object(struct got_object_idset_element *entry, void *arg) +remove_reused_object(struct got_object_id *id, void *data, void *arg) { struct got_object_idset *idset = arg; - struct got_pack_meta *m; + struct got_pack_meta *m = data; - m = got_object_idset_get_element_data(entry); if (m->have_reused_delta) - got_object_idset_remove_element(idset, entry); + got_object_idset_remove(NULL, idset, id); return NULL; } @@ -1840,8 +1839,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, if (err) return err; - err = got_object_idset_for_each_element(idset, - remove_unused_object, idset); + err = got_object_idset_for_each(idset, remove_unused_object, idset); if (err) goto done; @@ -1877,7 +1875,7 @@ got_pack_create(uint8_t *packsha1, FILE *packfile, if (err) goto done; if (reuse.nmeta > 0) { - err = got_object_idset_for_each_element(idset, + err = got_object_idset_for_each(idset, remove_reused_object, idset); if (err) goto done;