use double hashing instead of linear probing #7418

ben-schwen · 2025-11-06T21:25:50Z

So I implemented double hashing and adjusted the table size to the power of 2 (both suggestions from Ivan in #6694)

knuth is Ivans linear probing with knuth hash
double is the first try of double hashing where some mixing of XOR happens to spare multiplications
noxor is an easier variation without mixing
khash is using the khash.h from klib

What really made (double and noxor) competitive with master and khash was probably parallelizing the probing in chmatch after building the hashtable (khash also uses this). What is left is to benchmark chmatchdup which is probably slower and either needs adjustment of the hash table to also save a list of elements or at least cache the 3x lookups.

library(atime)
library(data.table)

pkg.path <- '.'
limit <- 1
# taken from .ci/atime/tests.R
pkg.edit.fun <- function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace <- function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
  Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ <- paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}

versions <- c(
  master = '3c90b0f80e1d5b54ea97f3b56f28df07b93e820a',
  knuth = 'd7a9a1707ec94ec4f2bd86a5dfb5609207029ba4',
  double = '1d88ad4c9dd179fb815266fb3acb687a89b8b7c7',
  noxor = '48b19422dd331a9d7eaddff507563bcf9643ebdd',
  khash = '3178f10af7860223b7bea0fba519104d6a9d55ca'
)

set.seed(3)
sample_strings = function(N=10, len=4) {
   do.call(paste0, replicate(len, sample(LETTERS, N, TRUE), FALSE))
}

N <- 10^seq(2, 7.5, .25)

# Benchmark 1: Large table 
tab_full = sample_strings(1e6, 10)
tab_small = sample(tab_full, 9e5)
chmatch_work1 <- lapply(setNames(nm = N), \(N)
  sample(tab_full, N, TRUE)
)

chmatch1 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work1[[as.character(N)]], tab_small),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch1)

# Benchmark 2: Small table (expected case: a few distinct strings - 26 letters)
chmatch_work2 <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
chmatch2 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work2[[as.character(N)]], letters),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch2)

# Benchmark 3: Medium table (10K unique strings)
medium_table <- sample_strings(10000, len=4)
chmatch_work3 <- lapply(setNames(nm = N), \(N)
  sample(medium_table, N, TRUE)
)
chmatch3 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work3[[as.character(N)]], medium_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch3)

# Benchmark 4: High miss rate (most items not found)
miss_table <- sample_strings(100, len=4)
chmatch_work4 <- lapply(setNames(nm = N), \(N)
  sample_strings(N, len=4)
)
chmatch4 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work4[[as.character(N)]], miss_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch4)

# Benchmark 5: All hits (every item found)
hit_table <- sample_strings(1000, len=4)
chmatch_work5 <- lapply(setNames(nm = N), \(N)
  sample(hit_table, N, TRUE)
)
chmatch5 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work5[[as.character(N)]], hit_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch5)

# Benchmark 6: chin (logical return) with small table
chin_work <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
chin_bench <- atime_versions(
  pkg.path, N,
  expr = data.table::`%chin%`(chin_work[[as.character(N)]], letters),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chin_bench)

# Benchmark 7: Long strings (16 chars)
long_table <- sample_strings(1000, len=16)
chmatch_work7 <- lapply(setNames(nm = N), \(N)
  sample(long_table, N, TRUE)
)
chmatch7 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work7[[as.character(N)]], long_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch7)

# Benchmark 8: Realistic join keys (90% hit rate)
join_table <- sample_strings(10000, len=8)
chmatch_work8 <- lapply(setNames(nm = N), \(N) {
  # 90% from join_table (will be found), 10% new (won't be found)
  c(
    sample(join_table, as.integer(N * 0.9), TRUE),
    sample_strings(as.integer(N * 0.1), len=8)
  )
})
chmatch8 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work8[[as.character(N)]], join_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch8)


N <- 10^seq(2, 7.5, .25)
# expected case: a few distinct strings
forderv1_work <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
forderv1 <- atime_versions(
  pkg.path, N,
  expr = data.table:::forderv(forderv1_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(forderv1)
rm(forderv1_work); gc(full = TRUE)

# worst case: all strings different
# (a challenge for the allocator too due to many small immovable objects)
N <- 10^seq(2, 7.5, .25)
forderv2_work <- lapply(setNames(nm = N), \(N)
  format(runif(N), digits = 16)
)
forderv2 <- atime_versions(
  pkg.path, N,
  expr = data.table:::forderv(forderv2_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(forderv2)
rm(forderv2_work); gc(full = TRUE)

# expected case: all columns named the same
N <- 10^seq(1, 5.5, .25) # number of data.tables in the list
k <- 10 # number of columns per data.table
rbindlist1_work <- lapply(setNames(nm = N), \(N)
  rep(list(setNames(as.list(1:k), letters[1:k])), N)
)
rbindlist1 <- atime_versions(
  pkg.path, N,
  expr = data.table::rbindlist(rbindlist1_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(rbindlist1)
rm(rbindlist1_work); gc(full = TRUE)

# worst case: all columns different
N <- 10^seq(1, 4.5, .25) # number of data.tables in the list
k <- 10 # number of columns per data.table
rbindlist2_work <- lapply(setNames(nm = N), \(N)
  replicate(N, setNames(as.list(1:k), format(runif(k), digits = 16)), FALSE)
)
rbindlist2 <- atime_versions(
  pkg.path, N,
  expr = data.table::rbindlist(rbindlist2_work[[as.character(N)]], fill = TRUE),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(rbindlist2)
rm(rbindlist2_work); gc(full = TRUE)

codecov · 2025-11-06T21:35:44Z

Codecov Report

❌ Patch coverage is 83.01887% with 9 lines in your changes missing coverage. Please review.
⚠️ Please upload report for BASE (truehash@c2b5c67). Learn more about missing BASE report.

Files with missing lines	Patch %	Lines
src/hash.c	82.35%	9 Missing ⚠️

Additional details and impacted files

@@             Coverage Diff             @@
##             truehash    #7418   +/-   ##
===========================================
  Coverage            ?   99.04%           
===========================================
  Files               ?       85           
  Lines               ?    16126           
  Branches            ?        0           
===========================================
  Hits                ?    15972           
  Misses              ?      154           
  Partials            ?        0

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:

❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

tdhock · 2025-11-07T17:52:33Z

interesting atime result data! Thanks for sharing.
I tried running the code and got this error

Error in value[[3L]](cond) : 
  Error in revparse_single(object, branch): Error in 'git2r_revparse_single': Requested object could not be found

 when trying to checkout 3178f10af7860223b7bea0fba519104d6a9d55ca

is khash = '3178f10af7860223b7bea0fba519104d6a9d55ca' from a deleted branch?

tdhock · 2025-11-07T17:59:28Z

on my windows machine I get similar results for the first test.

I also fit asymptotic reference lines, which suggest that everything is linear (differences are constant factors).

library(data.table)
pkg.path <- '~/R/data.table'
limit <- 1
# taken from .ci/atime/tests.R
pkg.edit.fun <- function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace <- function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
  Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ <- paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}

versions <- c(
  master = '3c90b0f80e1d5b54ea97f3b56f28df07b93e820a',
  knuth = 'd7a9a1707ec94ec4f2bd86a5dfb5609207029ba4',
  double = '1d88ad4c9dd179fb815266fb3acb687a89b8b7c7',
  noxor = '48b19422dd331a9d7eaddff507563bcf9643ebdd'
  #khash = '3178f10af7860223b7bea0fba519104d6a9d55ca'
)

set.seed(3)
sample_strings = function(N=10, len=4) {
   do.call(paste0, replicate(len, sample(LETTERS, N, TRUE), FALSE))
}

N <- 10^seq(2, 7.5, .25)

# Benchmark 1: Large table 
tab_full = sample_strings(1e6, 10)
tab_small = sample(tab_full, 9e5)
chmatch_work1 <- lapply(setNames(nm = N), \(N)
  sample(tab_full, N, TRUE)
)

chmatch1 <- atime::atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work1[[as.character(N)]], tab_small),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch1)
chmatch1.refs <- atime::references_best(chmatch1)
plot(chmatch1.refs)

tdhock · 2025-11-07T18:03:17Z

atime CI failure can probably be fixed by merging master into this branch.

ben-schwen · 2025-11-08T11:29:38Z

is khash = '3178f10af7860223b7bea0fba519104d6a9d55ca' from a deleted branch?

The khash commit is only locally and not committed since I didn't want to push other licensed files to the repo.

ben-schwen added 12 commits March 23, 2025 19:16

add lookup or insert

507ca36

use lookup or insert

47319c3

use lookup_or_insert

b7d126c

really use lookup or insert

337a0c2

use cuckoo hashing

acdb899

add rehash

09b3725

use power of 2 and mask instead of modulo

7d4b067

mix instead of multiplication

e169d2c

use different mixes

b017013

change multipliers

5a474e0

use double hashing

1d88ad4

remove xor folding

48b1942

ben-schwen requested review from MichaelChirico and aitap as code owners November 6, 2025 21:25

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

use double hashing instead of linear probing #7418

use double hashing instead of linear probing #7418

ben-schwen commented Nov 6, 2025 •

edited

Loading

Uh oh!

codecov bot commented Nov 6, 2025

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

ben-schwen commented Nov 8, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

use double hashing instead of linear probing #7418

Are you sure you want to change the base?

use double hashing instead of linear probing #7418

Conversation

ben-schwen commented Nov 6, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

codecov bot commented Nov 6, 2025

Codecov Report

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

tdhock commented Nov 7, 2025

Uh oh!

ben-schwen commented Nov 8, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

ben-schwen commented Nov 6, 2025 •

edited

Loading