From 507ca362df996d798f5a301152cd865c463392e0 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 19:15:53 +0100
Subject: [PATCH 01/12] add lookup or insert

---
 src/hash.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/hash.c b/src/hash.c
index 5ac0c23129..5f5544cd4f 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -95,6 +95,30 @@ R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) {
   return ifnotfound; // # nocov
 }
 
+R_xlen_t hash_lookup_or_insert(hashtab *h, SEXP key, R_xlen_t value) {
+  struct hash_pair *cell = h->tb + hash_index(key, h->multiplier) % h->size, *end = h->tb + h->size - 1;
+  for (size_t i = 0; i < h->size; ++i, cell = (cell == end ? h->tb : cell + 1)) {
+    if (cell->key == key) {
+      cell->value = value;
+      return cell->value;
+    } else if (!cell->key) {
+      if (!h->free) internal_error(
+        __func__, "no free slots left (full size=%zu)", h->size
+      );
+      --h->free;
+      *cell = (struct hash_pair){.key = key, .value = value};
+      return value;  // insert here
+    }
+  }
+
+  internal_error( // # nocov
+    __func__, "did not find a free slot for key %p; size=%zu, free=%zu",
+    (void*)key, h->size, h->free
+  );
+  // Should be impossible, but just in case:
+  return value;
+}
+
 typedef struct dhashtab_ {
   dhashtab public; // must be at offset 0
   size_t size, used, limit;

From 47319c3189ab3ee1b57530d5e896880fc91962c2 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 19:16:55 +0100
Subject: [PATCH 02/12] chmatch: replace lookup-then-set with plain hash_set

---
 src/chmatch.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/chmatch.c b/src/chmatch.c
index ac3851b1f1..5d7d75925c 100644
--- a/src/chmatch.c
+++ b/src/chmatch.c
@@ -59,8 +59,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
   int nuniq=0;
   for (int i=0; i<tablelen; ++i) {
     const SEXP s = td[i];
-    int tl = hash_lookup(marks, s, 0);
-    if (tl==0) hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
+    hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
   }
   // in future if we need NAs in x not to be matched to NAs in table ...
   if (chmatchdup) {

From b7d126c94e7a9eb4b49af6051c9048be4f000663 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 19:38:48 +0100
Subject: [PATCH 03/12] declare hash_lookup_or_insert; keep value of existing key

---
 src/data.table.h | 2 ++
 src/hash.c       | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/data.table.h b/src/data.table.h
index f9e502be87..1456008ad5 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -294,6 +294,8 @@ hashtab * hash_create(size_t n);
 void hash_set(hashtab *, SEXP key, R_xlen_t value);
 // Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
 R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);
+// Returns the value corresponding to the key present in the hash, otherwise inserts value.
+R_xlen_t hash_lookup_or_insert(hashtab *, SEXP key, R_xlen_t value);
 
 // The dynamically-allocated hash table has a public field for the R protection wrapper.
 // Keep it PROTECTed while the table is in use.
diff --git a/src/hash.c b/src/hash.c
index 5f5544cd4f..bce932cab6 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -99,8 +99,7 @@ R_xlen_t hash_lookup_or_insert(hashtab *h, SEXP key, R_xlen_t value) {
   struct hash_pair *cell = h->tb + hash_index(key, h->multiplier) % h->size, *end = h->tb + h->size - 1;
   for (size_t i = 0; i < h->size; ++i, cell = (cell == end ? h->tb : cell + 1)) {
     if (cell->key == key) {
-      cell->value = value;
-      return cell->value;
+      return cell->value; // found key, only lookup, no insert
     } else if (!cell->key) {
       if (!h->free) internal_error(
         __func__, "no free slots left (full size=%zu)", h->size

From 337a0c2d508a31c59885416d7929ff6d6a4b0bda Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 19:44:41 +0100
Subject: [PATCH 04/12] chmatch: call hash_lookup_or_insert instead of hash_set

---
 src/chmatch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chmatch.c b/src/chmatch.c
index 5d7d75925c..5d39d6d217 100644
--- a/src/chmatch.c
+++ b/src/chmatch.c
@@ -59,7 +59,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
   int nuniq=0;
   for (int i=0; i<tablelen; ++i) {
     const SEXP s = td[i];
-    hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
+    hash_lookup_or_insert(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
   }
   // in future if we need NAs in x not to be matched to NAs in table ...
   if (chmatchdup) {

From acdb8993a8b32b32ef0074755813642b566e8c9a Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 20:20:12 +0100
Subject: [PATCH 05/12] use cuckoo hashing; remove hash_lookup_or_insert

---
 src/chmatch.c    |   3 +-
 src/data.table.h |   2 -
 src/hash.c       | 109 +++++++++++++++++++++--------------------------
 3 files changed, 51 insertions(+), 63 deletions(-)

diff --git a/src/chmatch.c b/src/chmatch.c
index 5d39d6d217..ac3851b1f1 100644
--- a/src/chmatch.c
+++ b/src/chmatch.c
@@ -59,7 +59,8 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
   int nuniq=0;
   for (int i=0; i<tablelen; ++i) {
     const SEXP s = td[i];
-    hash_lookup_or_insert(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
+    int tl = hash_lookup(marks, s, 0);
+    if (tl==0) hash_set(marks, s, chmatchdup ? -(++nuniq) : -i-1); // first time seen this string in table
   }
   // in future if we need NAs in x not to be matched to NAs in table ...
   if (chmatchdup) {
diff --git a/src/data.table.h b/src/data.table.h
index 1456008ad5..f9e502be87 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -294,8 +294,6 @@ hashtab * hash_create(size_t n);
 void hash_set(hashtab *, SEXP key, R_xlen_t value);
 // Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
 R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);
-// Returns the value corresponding to the key present in the hash, otherwise inserts value.
-R_xlen_t hash_lookup_or_insert(hashtab *, SEXP key, R_xlen_t value);
 
 // The dynamically-allocated hash table has a public field for the R protection wrapper.
 // Keep it PROTECTed while the table is in use.
diff --git a/src/hash.c b/src/hash.c
index bce932cab6..1a96866aa7 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -8,12 +8,13 @@ struct hash_pair {
 };
 struct hash_tab {
   size_t size, free;
-  uintptr_t multiplier;
-  struct hash_pair tb[];
+  uintptr_t multiplier1, multiplier2;
+  struct hash_pair *tb1, *tb2;
 };
 
 // TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
-static const double hash_multiplier = 0.618033988749895;
+static const double hash_multiplier1 = 0.618033988749895;
+static const double hash_multiplier2 = 0.316227766016838;
 
 static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
   if (load_factor <= 0 || load_factor >= 1)
@@ -39,14 +40,19 @@ static hashtab * hash_create_(size_t n, double load_factor) {
       __func__, "n=%zu with load_factor=%g would overflow total allocation size",
       n, load_factor
     );
-  hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1);
+  hashtab *ret = (hashtab *)R_alloc(sizeof(hashtab), 1);
   ret->size = n_full;
   ret->free = n;
   // To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size.
-  ret->multiplier = n_full * hash_multiplier;
+  ret->multiplier1 = n_full * hash_multiplier1;
+  ret->multiplier2 = n_full * hash_multiplier2;
+  ret->tb1 = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
+  ret->tb2 = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
   // No valid SEXP is a null pointer, so it's a safe marker for empty cells.
-  for (size_t i = 0; i < n_full; ++i)
-    ret->tb[i].key = NULL;
+  for (size_t i = 0; i < n_full; ++i) {
+    ret->tb1[i].key = NULL;
+    ret->tb2[i].key = NULL;
+  }
   return ret;
 }
 
@@ -54,70 +60,53 @@ hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
 
 // Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
 // This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
-static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier) {
+static R_INLINE size_t hash_index1(SEXP key, uintptr_t multiplier) {
   // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
   // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
   // which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
   return ((((uintptr_t)key) >> 4) & 0x0fffffff) * multiplier;
 }
 
-void hash_set(hashtab * h, SEXP key, R_xlen_t value) {
-  struct hash_pair *cell = h->tb + hash_index(key, h->multiplier) % h->size, *end = h->tb + h->size - 1;
-  for (size_t i = 0; i < h->size; ++i, cell = cell == end ? h->tb : cell+1) {
-    if (cell->key == key) {
-      cell->value = value;
+static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
+    return ((((uintptr_t)key) >> 6) & 0x0fffffff) * multiplier;
+}
+
+
+void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
+  size_t max_relocations = h->size; 
+  struct hash_pair item = { .key = key, .value = value };
+  for (size_t i = 0; i < max_relocations; ++i) {
+    size_t idx1 = hash_index1(item.key, h->multiplier1) % h->size;
+    if (!h->tb1[idx1].key) {
+      h->tb1[idx1] = item;
       return;
-    } else if (!cell->key) {
-      if (!h->free) internal_error(
-        __func__, "no free slots left (full size=%zu)", h->size
-      );
-      --h->free;
-      *cell = (struct hash_pair){.key = key, .value = value};
+    }
+    struct hash_pair temp = h->tb1[idx1];
+    h->tb1[idx1] = item;
+    item = temp;    
+    
+    size_t idx2 = hash_index2(item.key, h->multiplier2) % h->size;
+    if (!h->tb2[idx2].key) {
+      h->tb2[idx2] = item;
       return;
     }
+    temp = h->tb2[idx2];
+    h->tb2[idx2] = item;
+    item = temp;
   }
-  internal_error( // # nocov
-    __func__, "did not find a free slot for key %p; size=%zu, free=%zu",
-    (void*)key, h->size, h->free
-  );
+  internal_error(__func__, "Cuckoo hashing cycle detected, rehash needed");
 }
 
-R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) {
-  const struct hash_pair * cell = h->tb + hash_index(key, h->multiplier) % h->size, *end = h->tb + h->size - 1;
-  for (size_t i = 0; i < h->size; ++i, cell = cell == end ? h->tb : cell+1) {
-    if (cell->key == key) {
-      return cell->value;
-    } else if (!cell->key) {
-      return ifnotfound;
-    }
-  }
+R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
+  size_t idx1 = hash_index1(key, h->multiplier1) % h->size;
+  if (h->tb1[idx1].key == key) return h->tb1[idx1].value;
+    
+  size_t idx2 = hash_index2(key, h->multiplier2) % h->size;
+  if (h->tb2[idx2].key == key) return h->tb2[idx2].value;
   // Should be impossible with a load factor below 1, but just in case:
   return ifnotfound; // # nocov
 }
 
-R_xlen_t hash_lookup_or_insert(hashtab *h, SEXP key, R_xlen_t value) {
-  struct hash_pair *cell = h->tb + hash_index(key, h->multiplier) % h->size, *end = h->tb + h->size - 1;
-  for (size_t i = 0; i < h->size; ++i, cell = (cell == end ? h->tb : cell + 1)) {
-    if (cell->key == key) {
-      return cell->value; // found key, only lookup, no insert
-    } else if (!cell->key) {
-      if (!h->free) internal_error(
-        __func__, "no free slots left (full size=%zu)", h->size
-      );
-      --h->free;
-      *cell = (struct hash_pair){.key = key, .value = value};
-      return value;  // insert here
-    }
-  }
-
-  internal_error( // # nocov
-    __func__, "did not find a free slot for key %p; size=%zu, free=%zu",
-    (void*)key, h->size, h->free
-  );
-  // Should be impossible, but just in case:
-  return value;
-}
-
 typedef struct dhashtab_ {
   dhashtab public; // must be at offset 0
   size_t size, used, limit;
@@ -158,7 +147,7 @@ static dhashtab * dhash_create_(size_t n, double load_factor) {
   self->table = dhash_allocate(n_full);
   self->size = n_full;
   self->limit = n;
-  self->multiplier = n_full * hash_multiplier;
+  self->multiplier = n_full * hash_multiplier1;
   // this is the last time we're allowed to set the table parts piece by piece
 
   UNPROTECT(1);
@@ -172,10 +161,10 @@ static void dhash_enlarge(dhashtab_ * self) {
     internal_error(__func__, "doubling %zu elements would overflow size_t", self->size); // # nocov
   size_t new_size = self->size * 2;
   struct hash_pair * new = dhash_allocate(new_size);
-  uintptr_t new_multiplier = new_size * hash_multiplier;
+  uintptr_t new_multiplier = new_size * hash_multiplier1;
   for (size_t i = 0; i < self->size; ++i) {
     for (size_t j = 0; j < new_size; ++j) {
-      size_t ii = (hash_index(self->table[i].key, new_multiplier) + j) % new_size;
+      size_t ii = (hash_index1(self->table[i].key, new_multiplier) + j) % new_size;
       if (!new[ii].key) {
         new[ii] = (struct hash_pair){
           .key = self->table[i].key,
@@ -208,7 +197,7 @@ void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
   dhashtab_ * self = (dhashtab_ *)h;
   struct hash_pair *cell, *end;
 again:
-  cell = self->table + hash_index(key, self->multiplier) % self->size;
+  cell = self->table + hash_index1(key, self->multiplier) % self->size;
   end = self->table + self->size - 1;
   for (size_t i = 0; i < self->size; ++i, cell = cell == end ? self->table : cell+1) {
     if (cell->key == key) {
@@ -234,7 +223,7 @@ R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
   #pragma omp flush // no locking or atomic access! this is bad
   dhashtab_ self = *(dhashtab_ *)h;
   R_xlen_t ret = ifnotfound;
-  const struct hash_pair * cell = self.table + hash_index(key, self.multiplier) % self.size;
+  const struct hash_pair * cell = self.table + hash_index1(key, self.multiplier) % self.size;
   const struct hash_pair * end = self.table + self.size - 1;
   for (size_t i = 0; i < self.size; ++i, cell = cell == end ? self.table : cell+1) {
     if (cell->key == key) {

From 09b3725acce257bbc6ef2cb55c36220528bc42e0 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Sun, 23 Mar 2025 20:27:52 +0100
Subject: [PATCH 06/12] add rehash

---
 src/hash.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/hash.c b/src/hash.c
index 1a96866aa7..438d894722 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -71,6 +71,16 @@ static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
     return ((((uintptr_t)key) >> 6) & 0x0fffffff) * multiplier;
 }
 
+void hash_rehash(hashtab *h) {
+  size_t new_size = h->size * 2;
+  hashtab *new_h = hash_create_(new_size, 0.5);
+
+  for (size_t i = 0; i < h->size; ++i) {
+    if (h->tb1[i].key) hash_set(new_h, h->tb1[i].key, h->tb1[i].value);
+    if (h->tb2[i].key) hash_set(new_h, h->tb2[i].key, h->tb2[i].value);
+  }
+    *h = *new_h;
+}
 
 void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
   size_t max_relocations = h->size; 
@@ -94,7 +104,9 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
     h->tb2[idx2] = item;
     item = temp;
   }
-  internal_error(__func__, "Cuckoo hashing cycle detected, rehash needed");
+  // need to rehash
+  hash_rehash(h);
+  hash_set(h, key, value);
 }
 
 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {

From 7d4b0672cc03a701ce2a4699ff56b2244f6899bf Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 11:10:17 +0100
Subject: [PATCH 07/12] use power of 2 and mask instead of modulo

---
 src/hash.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/hash.c b/src/hash.c
index 438d894722..f4aa0e7907 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -26,7 +26,15 @@ static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
     __func__, "n=%zu / load_factor=%g would overflow size_t",
     n_elements, load_factor
   );
-  return ceil(n_elements / load_factor);
+  size_t min_size = ceil(n_elements / load_factor);
+  // Round up to next power of 2 for fast modulo using bitwise AND
+  size_t pow2 = 1;
+  while (pow2 < min_size) {
+    if (pow2 > SIZE_MAX / 2)
+      internal_error(__func__, "size %zu would overflow size_t", min_size); // # nocov
+    pow2 *= 2;
+  }
+  return pow2;
 }
 
 static hashtab * hash_create_(size_t n, double load_factor) {
@@ -83,19 +91,20 @@ void hash_rehash(hashtab *h) {
 }
 
 void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
-  size_t max_relocations = h->size; 
+  size_t max_relocations = h->size;
+  size_t mask = h->size - 1;
   struct hash_pair item = { .key = key, .value = value };
   for (size_t i = 0; i < max_relocations; ++i) {
-    size_t idx1 = hash_index1(item.key, h->multiplier1) % h->size;
+    size_t idx1 = hash_index1(item.key, h->multiplier1) & mask;
     if (!h->tb1[idx1].key) {
       h->tb1[idx1] = item;
       return;
     }
     struct hash_pair temp = h->tb1[idx1];
     h->tb1[idx1] = item;
-    item = temp;    
-    
-    size_t idx2 = hash_index2(item.key, h->multiplier2) % h->size;
+    item = temp;
+
+    size_t idx2 = hash_index2(item.key, h->multiplier2) & mask;
     if (!h->tb2[idx2].key) {
       h->tb2[idx2] = item;
       return;
@@ -110,10 +119,11 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
 }
 
 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
-  size_t idx1 = hash_index1(key, h->multiplier1) % h->size;
+  size_t mask = h->size - 1;
+  size_t idx1 = hash_index1(key, h->multiplier1) & mask;
   if (h->tb1[idx1].key == key) return h->tb1[idx1].value;
-    
-  size_t idx2 = hash_index2(key, h->multiplier2) % h->size;
+
+  size_t idx2 = hash_index2(key, h->multiplier2) & mask;
   if (h->tb2[idx2].key == key) return h->tb2[idx2].value;
   // Should be impossible with a load factor below 1, but just in case:
   return ifnotfound; // # nocov
@@ -172,11 +182,13 @@ static void dhash_enlarge(dhashtab_ * self) {
   if (self->size > SIZE_MAX / 2)
     internal_error(__func__, "doubling %zu elements would overflow size_t", self->size); // # nocov
   size_t new_size = self->size * 2;
+  size_t new_mask = new_size - 1;
   struct hash_pair * new = dhash_allocate(new_size);
   uintptr_t new_multiplier = new_size * hash_multiplier1;
   for (size_t i = 0; i < self->size; ++i) {
+    if (!self->table[i].key) continue;
     for (size_t j = 0; j < new_size; ++j) {
-      size_t ii = (hash_index1(self->table[i].key, new_multiplier) + j) % new_size;
+      size_t ii = (hash_index1(self->table[i].key, new_multiplier) + j) & new_mask;
       if (!new[ii].key) {
         new[ii] = (struct hash_pair){
           .key = self->table[i].key,
@@ -209,7 +221,7 @@ void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
   dhashtab_ * self = (dhashtab_ *)h;
   struct hash_pair *cell, *end;
 again:
-  cell = self->table + hash_index1(key, self->multiplier) % self->size;
+  cell = self->table + (hash_index1(key, self->multiplier) & (self->size - 1));
   end = self->table + self->size - 1;
   for (size_t i = 0; i < self->size; ++i, cell = cell == end ? self->table : cell+1) {
     if (cell->key == key) {
@@ -235,7 +247,7 @@ R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
   #pragma omp flush // no locking or atomic access! this is bad
   dhashtab_ self = *(dhashtab_ *)h;
   R_xlen_t ret = ifnotfound;
-  const struct hash_pair * cell = self.table + hash_index1(key, self.multiplier) % self.size;
+  const struct hash_pair * cell = self.table + (hash_index1(key, self.multiplier) & (self.size - 1));
   const struct hash_pair * end = self.table + self.size - 1;
   for (size_t i = 0; i < self.size; ++i, cell = cell == end ? self.table : cell+1) {
     if (cell->key == key) {

From e169d2c4f1fdce613f8602967a4d7e48a98e2d10 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 12:48:29 +0100
Subject: [PATCH 08/12] mix instead of multiplication

---
 src/hash.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/hash.c b/src/hash.c
index f4aa0e7907..6f4d1e22af 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -68,15 +68,19 @@ hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
 
 // Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
 // This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
-static R_INLINE size_t hash_index1(SEXP key, uintptr_t multiplier) {
+static R_INLINE size_t hash_index1(SEXP key, size_t mask) {
   // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
   // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
   // which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
-  return ((((uintptr_t)key) >> 4) & 0x0fffffff) * multiplier;
+  uintptr_t h = (uintptr_t)key >> 4;
+  return h & mask;
 }
 
-static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
-    return ((((uintptr_t)key) >> 6) & 0x0fffffff) * multiplier;
+static R_INLINE size_t hash_index2(SEXP key, size_t mask) {
+  // Use XOR folding to mix up the bits
+  uintptr_t h = (uintptr_t)key >> 4;
+  h ^= h >> 10;
+  return h & mask;
 }
 
 void hash_rehash(hashtab *h) {
@@ -95,7 +99,7 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
   size_t mask = h->size - 1;
   struct hash_pair item = { .key = key, .value = value };
   for (size_t i = 0; i < max_relocations; ++i) {
-    size_t idx1 = hash_index1(item.key, h->multiplier1) & mask;
+    size_t idx1 = hash_index1(item.key, mask);
     if (!h->tb1[idx1].key) {
       h->tb1[idx1] = item;
       return;
@@ -104,7 +108,7 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
     h->tb1[idx1] = item;
     item = temp;
 
-    size_t idx2 = hash_index2(item.key, h->multiplier2) & mask;
+    size_t idx2 = hash_index2(item.key, mask);
     if (!h->tb2[idx2].key) {
       h->tb2[idx2] = item;
       return;
@@ -120,10 +124,10 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
 
 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
   size_t mask = h->size - 1;
-  size_t idx1 = hash_index1(key, h->multiplier1) & mask;
+  size_t idx1 = hash_index1(key, mask);
   if (h->tb1[idx1].key == key) return h->tb1[idx1].value;
 
-  size_t idx2 = hash_index2(key, h->multiplier2) & mask;
+  size_t idx2 = hash_index2(key, mask);
   if (h->tb2[idx2].key == key) return h->tb2[idx2].value;
   // Should be impossible with a load factor below 1, but just in case:
   return ifnotfound; // # nocov
@@ -188,7 +192,7 @@ static void dhash_enlarge(dhashtab_ * self) {
   for (size_t i = 0; i < self->size; ++i) {
     if (!self->table[i].key) continue;
     for (size_t j = 0; j < new_size; ++j) {
-      size_t ii = (hash_index1(self->table[i].key, new_multiplier) + j) & new_mask;
+      size_t ii = (hash_index1(self->table[i].key, new_mask) + j) & new_mask;
       if (!new[ii].key) {
         new[ii] = (struct hash_pair){
           .key = self->table[i].key,
@@ -221,7 +225,7 @@ void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
   dhashtab_ * self = (dhashtab_ *)h;
   struct hash_pair *cell, *end;
 again:
-  cell = self->table + (hash_index1(key, self->multiplier) & (self->size - 1));
+  cell = self->table + hash_index1(key, self->size - 1);
   end = self->table + self->size - 1;
   for (size_t i = 0; i < self->size; ++i, cell = cell == end ? self->table : cell+1) {
     if (cell->key == key) {
@@ -247,7 +251,7 @@ R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
   #pragma omp flush // no locking or atomic access! this is bad
   dhashtab_ self = *(dhashtab_ *)h;
   R_xlen_t ret = ifnotfound;
-  const struct hash_pair * cell = self.table + (hash_index1(key, self.multiplier) & (self.size - 1));
+  const struct hash_pair * cell = self.table + hash_index1(key, self.size - 1);
   const struct hash_pair * end = self.table + self.size - 1;
   for (size_t i = 0; i < self.size; ++i, cell = cell == end ? self.table : cell+1) {
     if (cell->key == key) {

From b017013da3af9f8a494eb151e6b4a67621c4fa53 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 13:07:27 +0100
Subject: [PATCH 09/12] use different mixes

---
 src/hash.c | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/src/hash.c b/src/hash.c
index 6f4d1e22af..ee4a2d16ca 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -12,9 +12,15 @@ struct hash_tab {
   struct hash_pair *tb1, *tb2;
 };
 
-// TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
-static const double hash_multiplier1 = 0.618033988749895;
-static const double hash_multiplier2 = 0.316227766016838;
+// Fast integer hash multipliers based on golden ratio and other constants
+// 0x9e3779b9 is 2^32 * phi (golden ratio) for 32-bit mixing
+#if SIZE_MAX == UINT64_MAX
+  static const uintptr_t hash_multiplier1 = 0x9e3779b97f4a7c15ULL;
+  static const uintptr_t hash_multiplier2 = 0x85ebca77c2b2ae35ULL;
+#else
+  static const uintptr_t hash_multiplier1 = 0x9e3779b9U;
+  static const uintptr_t hash_multiplier2 = 0x85ebca77U;
+#endif
 
 static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
   if (load_factor <= 0 || load_factor >= 1)
@@ -51,7 +57,7 @@ static hashtab * hash_create_(size_t n, double load_factor) {
   hashtab *ret = (hashtab *)R_alloc(sizeof(hashtab), 1);
   ret->size = n_full;
   ret->free = n;
-  // To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size.
+  // Multiply by size to get different hash functions when rehashing
   ret->multiplier1 = n_full * hash_multiplier1;
   ret->multiplier2 = n_full * hash_multiplier2;
   ret->tb1 = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
@@ -66,21 +72,22 @@ static hashtab * hash_create_(size_t n, double load_factor) {
 
 hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
 
-// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
-// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
-static R_INLINE size_t hash_index1(SEXP key, size_t mask) {
-  // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
-  // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
-  // which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
+// Fast hash mixing using XOR-shift and integer multiplication
+static R_INLINE size_t hash_index1(SEXP key, uintptr_t multiplier) {
   uintptr_t h = (uintptr_t)key >> 4;
-  return h & mask;
+  // XOR folding to mix high bits into low bits
+  h ^= h >> 16;
+  h *= multiplier;
+  h ^= h >> 13;
+  return h;
 }
 
-static R_INLINE size_t hash_index2(SEXP key, size_t mask) {
-  // Use XOR folding to mix up the bits
-  uintptr_t h = (uintptr_t)key >> 4;
-  h ^= h >> 10;
-  return h & mask;
+static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
+  uintptr_t h = (uintptr_t)key >> 6;
+  h ^= h >> 18;
+  h *= multiplier;
+  h ^= h >> 15;
+  return h;
 }
 
 void hash_rehash(hashtab *h) {
@@ -99,7 +106,7 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
   size_t mask = h->size - 1;
   struct hash_pair item = { .key = key, .value = value };
   for (size_t i = 0; i < max_relocations; ++i) {
-    size_t idx1 = hash_index1(item.key, mask);
+    size_t idx1 = hash_index1(item.key, h->multiplier1) & mask;
     if (!h->tb1[idx1].key) {
       h->tb1[idx1] = item;
       return;
@@ -108,7 +115,7 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
     h->tb1[idx1] = item;
     item = temp;
 
-    size_t idx2 = hash_index2(item.key, mask);
+    size_t idx2 = hash_index2(item.key, h->multiplier2) & mask;
     if (!h->tb2[idx2].key) {
       h->tb2[idx2] = item;
       return;
@@ -124,10 +131,10 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
 
 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
   size_t mask = h->size - 1;
-  size_t idx1 = hash_index1(key, mask);
+  size_t idx1 = hash_index1(key, h->multiplier1) & mask;
   if (h->tb1[idx1].key == key) return h->tb1[idx1].value;
 
-  size_t idx2 = hash_index2(key, mask);
+  size_t idx2 = hash_index2(key, h->multiplier2) & mask;
   if (h->tb2[idx2].key == key) return h->tb2[idx2].value;
   // Should be impossible with a load factor below 1, but just in case:
   return ifnotfound; // # nocov
@@ -192,7 +199,7 @@ static void dhash_enlarge(dhashtab_ * self) {
   for (size_t i = 0; i < self->size; ++i) {
     if (!self->table[i].key) continue;
     for (size_t j = 0; j < new_size; ++j) {
-      size_t ii = (hash_index1(self->table[i].key, new_mask) + j) & new_mask;
+      size_t ii = (hash_index1(self->table[i].key, new_multiplier) + j) & new_mask;
       if (!new[ii].key) {
         new[ii] = (struct hash_pair){
           .key = self->table[i].key,
@@ -225,7 +232,7 @@ void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
   dhashtab_ * self = (dhashtab_ *)h;
   struct hash_pair *cell, *end;
 again:
-  cell = self->table + hash_index1(key, self->size - 1);
+  cell = self->table + (hash_index1(key, self->multiplier) & (self->size - 1));
   end = self->table + self->size - 1;
   for (size_t i = 0; i < self->size; ++i, cell = cell == end ? self->table : cell+1) {
     if (cell->key == key) {
@@ -251,7 +258,7 @@ R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
   #pragma omp flush // no locking or atomic access! this is bad
   dhashtab_ self = *(dhashtab_ *)h;
   R_xlen_t ret = ifnotfound;
-  const struct hash_pair * cell = self.table + hash_index1(key, self.size - 1);
+  const struct hash_pair * cell = self.table + (hash_index1(key, self.multiplier) & (self.size - 1));
   const struct hash_pair * end = self.table + self.size - 1;
   for (size_t i = 0; i < self.size; ++i, cell = cell == end ? self.table : cell+1) {
     if (cell->key == key) {

From 5a474e0eb4b22a825b3416934793b73be9433a18 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 13:20:54 +0100
Subject: [PATCH 10/12] restore floating-point hash multipliers

---
 src/hash.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/hash.c b/src/hash.c
index ee4a2d16ca..fff465c816 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -12,15 +12,10 @@ struct hash_tab {
   struct hash_pair *tb1, *tb2;
 };
 
-// Fast integer hash multipliers based on golden ratio and other constants
-// 0x9e3779b9 is 2^32 * phi (golden ratio) for 32-bit mixing
-#if SIZE_MAX == UINT64_MAX
-  static const uintptr_t hash_multiplier1 = 0x9e3779b97f4a7c15ULL;
-  static const uintptr_t hash_multiplier2 = 0x85ebca77c2b2ae35ULL;
-#else
-  static const uintptr_t hash_multiplier1 = 0x9e3779b9U;
-  static const uintptr_t hash_multiplier2 = 0x85ebca77U;
-#endif
+// TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
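+// hash_multiplier1 ~ 1/phi and hash_multiplier2 ~ 1/sqrt(10); hash_create_() scales both by the table size.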
+static const double hash_multiplier1 = 0.618033988749895;
+static const double hash_multiplier2 = 0.316227766016838;
 
 static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
   if (load_factor <= 0 || load_factor >= 1)

From 1d88ad4c9dd179fb815266fb3acb687a89b8b7c7 Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 13:35:20 +0100
Subject: [PATCH 11/12] use double hashing

---
 src/chmatch.c |  2 ++
 src/hash.c    | 61 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/src/chmatch.c b/src/chmatch.c
index ac3851b1f1..dd474853fc 100644
--- a/src/chmatch.c
+++ b/src/chmatch.c
@@ -101,10 +101,12 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
     free(counts);
     free(map);
   } else if (chin) {
+    #pragma omp parallel for if(xlen > 100000) schedule(static) num_threads(getDTthreads(xlen, false))
     for (int i=0; i<xlen; i++) {
       ansd[i] = hash_lookup(marks,xd[i],0)<0;
     }
   } else {
+    #pragma omp parallel for if(xlen > 100000) schedule(static) num_threads(getDTthreads(xlen, false))
     for (int i=0; i<xlen; i++) {
       const int m = hash_lookup(marks,xd[i],0);
       ansd[i] = (m<0) ? -m : nomatch;
diff --git a/src/hash.c b/src/hash.c
index fff465c816..ccd29358c7 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -9,7 +9,7 @@ struct hash_pair {
 struct hash_tab {
   size_t size, free;
   uintptr_t multiplier1, multiplier2;
-  struct hash_pair *tb1, *tb2;
+  struct hash_pair *table;  // Single table for double hashing
 };
 
 // TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
@@ -55,12 +55,10 @@ static hashtab * hash_create_(size_t n, double load_factor) {
   // Multiply by size to get different hash functions when rehashing
   ret->multiplier1 = n_full * hash_multiplier1;
   ret->multiplier2 = n_full * hash_multiplier2;
-  ret->tb1 = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
-  ret->tb2 = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
+  ret->table = (struct hash_pair *)R_alloc(sizeof(struct hash_pair[n_full]), 1);
   // No valid SEXP is a null pointer, so it's a safe marker for empty cells.
   for (size_t i = 0; i < n_full; ++i) {
-    ret->tb1[i].key = NULL;
-    ret->tb2[i].key = NULL;
+    ret->table[i].key = NULL;
   }
   return ret;
 }
@@ -90,35 +88,36 @@ void hash_rehash(hashtab *h) {
   hashtab *new_h = hash_create_(new_size, 0.5);
 
   for (size_t i = 0; i < h->size; ++i) {
-    if (h->tb1[i].key) hash_set(new_h, h->tb1[i].key, h->tb1[i].value);
-    if (h->tb2[i].key) hash_set(new_h, h->tb2[i].key, h->tb2[i].value);
+    if (h->table[i].key) hash_set(new_h, h->table[i].key, h->table[i].value);
   }
-    *h = *new_h;
+  *h = *new_h;
 }
 
 void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
-  size_t max_relocations = h->size;
   size_t mask = h->size - 1;
-  struct hash_pair item = { .key = key, .value = value };
-  for (size_t i = 0; i < max_relocations; ++i) {
-    size_t idx1 = hash_index1(item.key, h->multiplier1) & mask;
-    if (!h->tb1[idx1].key) {
-      h->tb1[idx1] = item;
+  size_t h1 = hash_index1(key, h->multiplier1) & mask;
+  size_t h2 = hash_index2(key, h->multiplier2) & mask;
+
+  if (h2 == 0) h2 = 1;
+  else if ((h2 & 1) == 0) h2 |= 1;
+
+  for (size_t i = 0; i < h->size; ++i) {
+    size_t idx = (h1 + i * h2) & mask;
+
+    if (!h->table[idx].key) {
+      // Empty slot found
+      h->table[idx].key = key;
+      h->table[idx].value = value;
+      h->free--;
       return;
     }
-    struct hash_pair temp = h->tb1[idx1];
-    h->tb1[idx1] = item;
-    item = temp;
 
-    size_t idx2 = hash_index2(item.key, h->multiplier2) & mask;
-    if (!h->tb2[idx2].key) {
-      h->tb2[idx2] = item;
+    if (h->table[idx].key == key) {
+      h->table[idx].value = value;
       return;
     }
-    temp = h->tb2[idx2];
-    h->tb2[idx2] = item;
-    item = temp;
   }
+
   // need to rehash
   hash_rehash(h);
   hash_set(h, key, value);
@@ -126,12 +125,18 @@ void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
 
 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
   size_t mask = h->size - 1;
-  size_t idx1 = hash_index1(key, h->multiplier1) & mask;
-  if (h->tb1[idx1].key == key) return h->tb1[idx1].value;
+  size_t h1 = hash_index1(key, h->multiplier1) & mask;
+  size_t h2 = hash_index2(key, h->multiplier2) & mask;
+
+  if (h2 == 0) h2 = 1;
+  else if ((h2 & 1) == 0) h2 |= 1;
+
+  for (size_t i = 0; i < h->size; ++i) {
+    size_t idx = (h1 + i * h2) & mask;
+    if (!h->table[idx].key) return ifnotfound;
+    if (h->table[idx].key == key) return h->table[idx].value;
+  }
 
-  size_t idx2 = hash_index2(key, h->multiplier2) & mask;
-  if (h->tb2[idx2].key == key) return h->tb2[idx2].value;
-  // Should be impossible with a load factor below 1, but just in case:
   return ifnotfound; // # nocov
 }
 

From 48b19422dd331a9d7eaddff507563bcf9643ebdd Mon Sep 17 00:00:00 2001
From: Benjamin Schwendinger <benjaminschwe@gmail.com>
Date: Thu, 6 Nov 2025 15:52:05 +0100
Subject: [PATCH 12/12] remove xor folding

---
 src/chmatch.c |  4 ++--
 src/hash.c    | 23 +++++++++++------------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/chmatch.c b/src/chmatch.c
index dd474853fc..34c50c22b3 100644
--- a/src/chmatch.c
+++ b/src/chmatch.c
@@ -101,12 +101,12 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
     free(counts);
     free(map);
   } else if (chin) {
-    #pragma omp parallel for if(xlen > 100000) schedule(static) num_threads(getDTthreads(xlen, false))
+    #pragma omp parallel for num_threads(getDTthreads(xlen, true))
     for (int i=0; i<xlen; i++) {
       ansd[i] = hash_lookup(marks,xd[i],0)<0;
     }
   } else {
-    #pragma omp parallel for if(xlen > 100000) schedule(static) num_threads(getDTthreads(xlen, false))
+    #pragma omp parallel for num_threads(getDTthreads(xlen, true))
     for (int i=0; i<xlen; i++) {
       const int m = hash_lookup(marks,xd[i],0);
       ansd[i] = (m<0) ? -m : nomatch;
diff --git a/src/hash.c b/src/hash.c
index ccd29358c7..9a5296fcb4 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -65,22 +65,21 @@ static hashtab * hash_create_(size_t n, double load_factor) {
 
 hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
 
-// Fast hash mixing using XOR-shift and integer multiplication
+// First hash for double hashing: hash_set()/hash_lookup() mask it to get the starting slot of the probe.
 static R_INLINE size_t hash_index1(SEXP key, uintptr_t multiplier) {
-  uintptr_t h = (uintptr_t)key >> 4;
-  // XOR folding to mix high bits into low bits
-  h ^= h >> 16;
-  h *= multiplier;
-  h ^= h >> 13;
-  return h;
+  // Drop the 4 lowest bits of the pointer: they are probably zeroes because a typical SEXPREC exceeds 16 bytes
+  // in size and, being heap-allocated, is subject to malloc() alignment guarantees (at least 4 bytes on 32-bit
+  // platforms, most likely more than 8 bytes). Only the next 28 bits are kept before multiplying.
+  return ((((uintptr_t)key) >> 4) & 0x0fffffff) * multiplier;
 }
 
 static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
-  uintptr_t h = (uintptr_t)key >> 6;
-  h ^= h >> 18;
-  h *= multiplier;
-  h ^= h >> 15;
-  return h;
+  // Second hash for double hashing: callers use it as the probe step, which must be coprime with the table size.
+  // Rotate the pointer right by 12 bits so the 28 bits kept start at bit 12 (hash_index1 starts at bit 4),
+  // then force the product odd, since any odd step is coprime with a power-of-2 table size.
+  uintptr_t ptr = (uintptr_t)key;
+  ptr = (ptr >> 12) | (ptr << (sizeof(uintptr_t) * 8 - 12));
+  return ((ptr & 0x0fffffff) * multiplier) | 1;
 }
 
 void hash_rehash(hashtab *h) {