Skip to content

Conversation

@ben-schwen
Copy link
Member

@ben-schwen ben-schwen commented Nov 6, 2025

So I implemented double hashing and adjusted the table size to the power of 2 (both suggestions from Ivan in #6694)

knuth is Ivans linear probing with knuth hash
double is the first try of double hashing where some mixing of XOR happens to spare multiplications
noxor is an easier variation without mixing
khash is using the khash.h from klib

What really made (double and noxor) competitive with master and khash was probably parallelizing the probing in chmatch after building the hashtable (khash also uses this). What is left is to benchmark chmatchdup which is probably slower and either needs adjustment of the hash table to also save a list of elements or at least cache the 3x lookups.

image image image
library(atime)
library(data.table)

pkg.path <- '.'
limit <- 1
# taken from .ci/atime/tests.R
pkg.edit.fun <- function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace <- function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
  Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ <- paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}

versions <- c(
  master = '3c90b0f80e1d5b54ea97f3b56f28df07b93e820a',
  knuth = 'd7a9a1707ec94ec4f2bd86a5dfb5609207029ba4',
  double = '1d88ad4c9dd179fb815266fb3acb687a89b8b7c7',
  noxor = '48b19422dd331a9d7eaddff507563bcf9643ebdd',
  khash = '3178f10af7860223b7bea0fba519104d6a9d55ca'
)

set.seed(3)
sample_strings = function(N=10, len=4) {
   do.call(paste0, replicate(len, sample(LETTERS, N, TRUE), FALSE))
}

N <- 10^seq(2, 7.5, .25)

# Benchmark 1: Large table 
tab_full = sample_strings(1e6, 10)
tab_small = sample(tab_full, 9e5)
chmatch_work1 <- lapply(setNames(nm = N), \(N)
  sample(tab_full, N, TRUE)
)

chmatch1 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work1[[as.character(N)]], tab_small),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch1)

# Benchmark 2: Small table (expected case: a few distinct strings - 26 letters)
chmatch_work2 <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
chmatch2 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work2[[as.character(N)]], letters),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch2)

# Benchmark 3: Medium table (10K unique strings)
medium_table <- sample_strings(10000, len=4)
chmatch_work3 <- lapply(setNames(nm = N), \(N)
  sample(medium_table, N, TRUE)
)
chmatch3 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work3[[as.character(N)]], medium_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch3)

# Benchmark 4: High miss rate (most items not found)
miss_table <- sample_strings(100, len=4)
chmatch_work4 <- lapply(setNames(nm = N), \(N)
  sample_strings(N, len=4)
)
chmatch4 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work4[[as.character(N)]], miss_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch4)

# Benchmark 5: All hits (every item found)
hit_table <- sample_strings(1000, len=4)
chmatch_work5 <- lapply(setNames(nm = N), \(N)
  sample(hit_table, N, TRUE)
)
chmatch5 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work5[[as.character(N)]], hit_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch5)

# Benchmark 6: chin (logical return) with small table
chin_work <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
chin_bench <- atime_versions(
  pkg.path, N,
  expr = data.table::`%chin%`(chin_work[[as.character(N)]], letters),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chin_bench)

# Benchmark 7: Long strings (16 chars)
long_table <- sample_strings(1000, len=16)
chmatch_work7 <- lapply(setNames(nm = N), \(N)
  sample(long_table, N, TRUE)
)
chmatch7 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work7[[as.character(N)]], long_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch7)

# Benchmark 8: Realistic join keys (90% hit rate)
join_table <- sample_strings(10000, len=8)
chmatch_work8 <- lapply(setNames(nm = N), \(N) {
  # 90% from join_table (will be found), 10% new (won't be found)
  c(
    sample(join_table, as.integer(N * 0.9), TRUE),
    sample_strings(as.integer(N * 0.1), len=8)
  )
})
chmatch8 <- atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work8[[as.character(N)]], join_table),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch8)


N <- 10^seq(2, 7.5, .25)
# expected case: a few distinct strings
forderv1_work <- lapply(setNames(nm = N), \(N)
  sample(letters, N, TRUE)
)
forderv1 <- atime_versions(
  pkg.path, N,
  expr = data.table:::forderv(forderv1_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(forderv1)
rm(forderv1_work); gc(full = TRUE)

# worst case: all strings different
# (a challenge for the allocator too due to many small immovable objects)
N <- 10^seq(2, 7.5, .25)
forderv2_work <- lapply(setNames(nm = N), \(N)
  format(runif(N), digits = 16)
)
forderv2 <- atime_versions(
  pkg.path, N,
  expr = data.table:::forderv(forderv2_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(forderv2)
rm(forderv2_work); gc(full = TRUE)

# expected case: all columns named the same
N <- 10^seq(1, 5.5, .25) # number of data.tables in the list
k <- 10 # number of columns per data.table
rbindlist1_work <- lapply(setNames(nm = N), \(N)
  rep(list(setNames(as.list(1:k), letters[1:k])), N)
)
rbindlist1 <- atime_versions(
  pkg.path, N,
  expr = data.table::rbindlist(rbindlist1_work[[as.character(N)]]),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(rbindlist1)
rm(rbindlist1_work); gc(full = TRUE)

# worst case: all columns different
N <- 10^seq(1, 4.5, .25) # number of data.tables in the list
k <- 10 # number of columns per data.table
rbindlist2_work <- lapply(setNames(nm = N), \(N)
  replicate(N, setNames(as.list(1:k), format(runif(k), digits = 16)), FALSE)
)
rbindlist2 <- atime_versions(
  pkg.path, N,
  expr = data.table::rbindlist(rbindlist2_work[[as.character(N)]], fill = TRUE),
  sha.vec = versions, seconds.limit = limit, verbose = TRUE,
  pkg.edit.fun = pkg.edit.fun
)
plot(rbindlist2)
rm(rbindlist2_work); gc(full = TRUE)

@codecov
Copy link

codecov bot commented Nov 6, 2025

Codecov Report

❌ Patch coverage is 83.01887% with 9 lines in your changes missing coverage. Please review.
⚠️ Please upload report for BASE (truehash@c2b5c67). Learn more about missing BASE report.

Files with missing lines Patch % Lines
src/hash.c 82.35% 9 Missing ⚠️
Additional details and impacted files
@@             Coverage Diff             @@
##             truehash    #7418   +/-   ##
===========================================
  Coverage            ?   99.04%           
===========================================
  Files               ?       85           
  Lines               ?    16126           
  Branches            ?        0           
===========================================
  Hits                ?    15972           
  Misses              ?      154           
  Partials            ?        0           

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@tdhock
Copy link
Member

tdhock commented Nov 7, 2025

interesting atime result data! Thanks for sharing.
I tried running the code and got this error

Error in value[[3L]](cond) : 
  Error in revparse_single(object, branch): Error in 'git2r_revparse_single': Requested object could not be found

 when trying to checkout 3178f10af7860223b7bea0fba519104d6a9d55ca

is khash = '3178f10af7860223b7bea0fba519104d6a9d55ca' from a deleted branch?

@tdhock
Copy link
Member

tdhock commented Nov 7, 2025

on my windows machine I get similar results for the first test.
image
I also fit asymptotic reference lines, which suggest that everything is linear (differences are constant factors).
image

library(data.table)
pkg.path <- '~/R/data.table'
limit <- 1
# taken from .ci/atime/tests.R
pkg.edit.fun <- function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace <- function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
  Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ <- paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}

versions <- c(
  master = '3c90b0f80e1d5b54ea97f3b56f28df07b93e820a',
  knuth = 'd7a9a1707ec94ec4f2bd86a5dfb5609207029ba4',
  double = '1d88ad4c9dd179fb815266fb3acb687a89b8b7c7',
  noxor = '48b19422dd331a9d7eaddff507563bcf9643ebdd'
  #khash = '3178f10af7860223b7bea0fba519104d6a9d55ca'
)

set.seed(3)
sample_strings = function(N=10, len=4) {
   do.call(paste0, replicate(len, sample(LETTERS, N, TRUE), FALSE))
}

N <- 10^seq(2, 7.5, .25)

# Benchmark 1: Large table 
tab_full = sample_strings(1e6, 10)
tab_small = sample(tab_full, 9e5)
chmatch_work1 <- lapply(setNames(nm = N), \(N)
  sample(tab_full, N, TRUE)
)

chmatch1 <- atime::atime_versions(
  pkg.path, N,
  expr = data.table::chmatch(chmatch_work1[[as.character(N)]], tab_small),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(chmatch1)
chmatch1.refs <- atime::references_best(chmatch1)
plot(chmatch1.refs)

@tdhock
Copy link
Member

tdhock commented Nov 7, 2025

atime CI failure can probably be fixed by merging master into this branch.

@ben-schwen
Copy link
Member Author

is khash = '3178f10af7860223b7bea0fba519104d6a9d55ca' from a deleted branch?

The khash commit is only locally and not committed since I didn't want to push other licensed files to the repo.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants