diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c7fc43f..d4bd33a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,8 +17,8 @@ jobs: strategy: matrix: rust: - - "1.60.0" - - "1.65.0" + - "1.66.0" + - "1.76.0" steps: - uses: dtolnay/rust-toolchain@master with: diff --git a/Cargo.toml b/Cargo.toml index 2a604da..379f9a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,3 +30,7 @@ harness = false [features] default = ["parallel"] parallel = ["rayon", "crossbeam-utils"] + +[profile.release] +lto = true +debug = 2 diff --git a/benches/build.rs b/benches/build.rs index 6e0da04..5daba91 100644 --- a/benches/build.rs +++ b/benches/build.rs @@ -4,34 +4,157 @@ extern crate bencher; use bencher::Bencher; -use boomphf::Mphf; +use boomphf::{ExternallyHashed, Mphf}; -fn build1_ser(bench: &mut Bencher) { +fn build1_ser_u64(bench: &mut Bencher) { + let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_slices(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_long_slices(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + long_key + }) + .collect::>(); bench.iter(|| { - let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); - let _ = Mphf::new(2.0, &items); + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_long_slices_externally_hashed(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key 
= [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + ExternallyHashed(wyhash::wyhash(&long_key, 0)) + }) + .collect::>(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); }); } #[allow(dead_code)] -fn build1_par(bench: &mut Bencher) { +fn build1_par_u64(bench: &mut Bencher) { + let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); #[cfg(feature = "parallel")] bench.iter(|| { - let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); - let _ = Mphf::new_parallel(2.0, &items, None); + std::hint::black_box(Mphf::new_parallel(2.0, &items, None)); }); } -fn scan1_ser(bench: &mut Bencher) { +#[allow(dead_code)] +fn build1_par_slices(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + #[cfg(feature = "parallel")] + bench.iter(|| { + std::hint::black_box(Mphf::new_parallel(2.0, &items, None)); + }); +} + +fn scan1_ser_u64(bench: &mut Bencher) { let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); let phf = Mphf::new(2.0, &items); bench.iter(|| { - for i in (0..1000000u64).map(|x| x * 2) { - phf.hash(&i); + for i in &items { + std::hint::black_box(phf.hash(&i)); + } + }); +} + +fn scan1_ser_slice(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_long_key(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + long_key + }) + .collect::>(); + let phf = Mphf::new(2.0, &items); + + 
bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_long_key_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + ExternallyHashed(wyhash::wyhash(&long_key, 0)) + }) + .collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); } }); } -benchmark_group!(benches, build1_ser, build1_par, scan1_ser); +benchmark_group!( + benches, + build1_ser_externally_hashed, + build1_ser_u64, + build1_ser_slices, + build1_ser_long_slices, + build1_ser_long_slices_externally_hashed, + build1_par_u64, + build1_par_slices, + scan1_ser_u64, + scan1_ser_slice, + scan1_ser_externally_hashed, + scan1_ser_long_key, + scan1_ser_long_key_externally_hashed +); benchmark_main!(benches); diff --git a/src/bitvector.rs b/src/bitvector.rs index b6e7b34..5ead413 100644 --- a/src/bitvector.rs +++ b/src/bitvector.rs @@ -363,7 +363,7 @@ impl BitVector { #[inline] pub fn get_word(&self, word: usize) -> u64 { #[cfg(feature = "parallel")] - return self.vector[word].load(Ordering::Relaxed) as u64; + return self.vector[word].load(Ordering::Relaxed); #[cfg(not(feature = "parallel"))] return self.vector[word] as u64; diff --git a/src/hashmap.rs b/src/hashmap.rs index 49df9f0..3d54135 100644 --- a/src/hashmap.rs +++ b/src/hashmap.rs @@ -3,17 +3,16 @@ #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; -use crate::Mphf; +use crate::{Mphf, SeedableHash}; use std::borrow::Borrow; use std::fmt::Debug; -use std::hash::Hash; use std::iter::ExactSizeIterator; /// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. This lets us store the keys and values in dense /// arrays, with ~3 bits/item overhead in the Mphf. 
#[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct BoomHashMap { +pub struct BoomHashMap { mphf: Mphf, pub(crate) keys: Vec, pub(crate) values: Vec, @@ -21,7 +20,7 @@ pub struct BoomHashMap { impl BoomHashMap where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D: Debug, { fn create_map(mut keys: Vec, mut values: Vec, mphf: Mphf) -> BoomHashMap { @@ -49,7 +48,7 @@ where pub fn get(&self, kmer: &Q) -> Option<&D> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -69,7 +68,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -89,7 +88,7 @@ where pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -133,7 +132,7 @@ where impl core::iter::FromIterator<(K, D)> for BoomHashMap where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D: Debug, { fn from_iter>(iter: I) -> Self { @@ -149,21 +148,21 @@ where } #[cfg(feature = "parallel")] -pub trait ConstructibleKey: Hash + Debug + PartialEq + Send + Sync {} +pub trait ConstructibleKey: SeedableHash + Debug + PartialEq + Send + Sync {} #[cfg(feature = "parallel")] -impl ConstructibleKey for T where T: Hash + Debug + PartialEq + Send + Sync {} +impl ConstructibleKey for T where T: SeedableHash + Debug + PartialEq + Send + Sync {} #[cfg(not(feature = "parallel"))] -pub trait ConstructibleKey: Hash + Debug + PartialEq {} +pub trait ConstructibleKey: SeedableHash + Debug + PartialEq {} #[cfg(not(feature = "parallel"))] -impl ConstructibleKey for T where T: Hash + Debug + PartialEq {} +impl ConstructibleKey for T where T: SeedableHash + Debug + PartialEq {} #[cfg(feature = "parallel")] impl BoomHashMap where - K: Hash + Debug + 
PartialEq + Send + Sync, + K: SeedableHash + Debug + PartialEq + Send + Sync, D: Debug, { /// Create a new hash map from the parallel array `keys` and `values`, using a parallelized method to construct the Mphf. @@ -174,12 +173,12 @@ where } /// Iterate over key-value pairs in a BoomHashMap -pub struct BoomIterator<'a, K: Hash + 'a, D: 'a> { +pub struct BoomIterator<'a, K: SeedableHash + 'a, D: 'a> { hash: &'a BoomHashMap, index: usize, } -impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> { +impl<'a, K: SeedableHash, D> Iterator for BoomIterator<'a, K, D> { type Item = (&'a K, &'a D); fn next(&mut self) -> Option { @@ -199,9 +198,9 @@ impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> { } } -impl<'a, K: Hash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {} +impl<'a, K: SeedableHash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {} -impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap { +impl<'a, K: SeedableHash, D> IntoIterator for &'a BoomHashMap { type Item = (&'a K, &'a D); type IntoIter = BoomIterator<'a, K, D>; @@ -219,19 +218,19 @@ impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap { /// arrays, with ~3 bits/item overhead in the Mphf. 
#[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct BoomHashMap2 { +pub struct BoomHashMap2 { mphf: Mphf, keys: Vec, values: Vec, aux_values: Vec, } -pub struct Boom2Iterator<'a, K: Hash + 'a, D1: 'a, D2: 'a> { +pub struct Boom2Iterator<'a, K: SeedableHash + 'a, D1: 'a, D2: 'a> { hash: &'a BoomHashMap2, index: usize, } -impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { +impl<'a, K: SeedableHash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { type Item = (&'a K, &'a D1, &'a D2); fn next(&mut self) -> Option { @@ -254,9 +253,9 @@ impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { } } -impl<'a, K: Hash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {} +impl<'a, K: SeedableHash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {} -impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2 { +impl<'a, K: SeedableHash, D1, D2> IntoIterator for &'a BoomHashMap2 { type Item = (&'a K, &'a D1, &'a D2); type IntoIter = Boom2Iterator<'a, K, D1, D2>; @@ -270,7 +269,7 @@ impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2 { impl BoomHashMap2 where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D1: Debug, D2: Debug, { @@ -310,7 +309,7 @@ where pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -329,7 +328,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -351,7 +350,7 @@ where pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -395,7 +394,7 @@ where impl core::iter::FromIterator<(K, D1, D2)> for BoomHashMap2 where - K: Hash + Debug + PartialEq, + K: 
SeedableHash + Debug + PartialEq, D1: Debug, D2: Debug, { @@ -416,7 +415,7 @@ where #[cfg(feature = "parallel")] impl BoomHashMap2 where - K: Hash + Debug + PartialEq + Send + Sync, + K: SeedableHash + Debug + PartialEq + Send + Sync, D1: Debug, D2: Debug, { @@ -500,7 +499,7 @@ where pub fn get(&self, kmer: &Q) -> Option<&D1> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -513,7 +512,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D1> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -619,7 +618,7 @@ where pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| (&self.values[pos as usize], &self.aux_values[pos as usize])) @@ -629,7 +628,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| { diff --git a/src/lib.rs b/src/lib.rs index 32ca5fc..49eccb1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,21 +52,184 @@ use std::sync::{Arc, Mutex}; #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; -#[inline] -fn fold(v: u64) -> u32 { - ((v & 0xFFFFFFFF) as u32) ^ ((v >> 32) as u32) +/// `hash_with_seed` used to build the wyhash seed as 1 << (iters + iters); external hashing instead +/// takes a faster lookup-table path keyed on iters. This keeps pre-existing (non-ExternallyHashed) +/// hashes unchanged. The shift wraps as in release builds, so iters >= 32 no longer panics in debug. 
+#[inline(always)] +fn default_seed_correction(seed: u64) -> u64 { + 1u64.wrapping_shl(seed.wrapping_add(seed) as u32) } -#[inline] -fn hash_with_seed(iter: u64, v: &T) -> u64 { - let mut state = wyhash::WyHash::with_seed(1 << (iter + iter)); - v.hash(&mut state); +fn default_hash_with_seed(value: &T, seed: u64) -> u64 { + let mut state = wyhash::WyHash::with_seed(default_seed_correction(seed)); + value.hash(&mut state); state.finish() } +/// This custom trait allows us to fast-path &[u8] to avoid constructing the temporary Hasher object. +/// Can be simplified once specialization is stabilized. +pub trait SeedableHash { + fn hash_with_seed(&self, seed: u64) -> u64; +} + +impl SeedableHash for [u8] { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(self, default_seed_correction(seed)) + } +} + +impl SeedableHash for [u8; N] { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(self, default_seed_correction(seed)) + } +} + +impl SeedableHash for u8 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&[*self], default_seed_correction(seed)) + } +} + +impl SeedableHash for i16 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u16 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for i32 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u32 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for i64 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u64 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl 
SeedableHash for isize { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for usize { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for &T { + fn hash_with_seed(&self, seed: u64) -> u64 { + (**self).hash_with_seed(seed) + } +} + +impl SeedableHash for &[T] { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for Vec { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for &str { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for String { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +/// This is a fast-path where the hash for an entry is known externally. That way we can skip hashing the +/// key for building / lookups which provides savings as keys grow longer or you need to do a lookup of the +/// same key across multiple perfect hashes. It's the user's responsibility to construct this with a value +/// that is deterministically derived from a key. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ExternallyHashed(pub u64); + +impl ExternallyHashed { + // Helper function for wyrng. + const fn wymum(a: u64, b: u64) -> u64 { + let mul = a as u128 * b as u128; + ((mul >> 64) ^ mul) as u64 + } + + // wyrng except a constified version + const fn wyrng(seed: u64) -> u64 { + const P0: u64 = 0xa076_1d64_78bd_642f; + const P1: u64 = 0xe703_7ed1_a0b4_28db; + + let seed = seed.wrapping_add(P0); + Self::wymum(seed ^ P1, seed) + } + + // Generate lookup tables to map the hash seed to a random value. 
+ const fn gen_seed_lookups() -> [u64; MAX_ITERS as usize + 1] { + let mut result = [0; MAX_ITERS as usize + 1]; + let mut i = 0; + while i <= MAX_ITERS { + result[i as usize] = Self::wyrng(i); + i += 1; + } + result + } + const SEED_HASH_LOOKUP_TABLES: [u64; MAX_ITERS as usize + 1] = Self::gen_seed_lookups(); + + // Map a hashmod seed to a pseudo-random value: table lookup for 0..=MAX_ITERS, computed via wyrng beyond it. + fn fast_seed_hash(x: u64) -> u64 { + let cached = Self::SEED_HASH_LOOKUP_TABLES.get(x as usize).copied(); + cached.unwrap_or_else(|| Self::wyrng(x)) + } + + // Quickly combine two hashes. Because .0 represents a hash, we know it's random and doesn't need to be + // independently hashed again, so we just need to combine it uniquely with iters. + fn hash_combine(h1: u64, h2: u64) -> u64 { + // https://stackoverflow.com/questions/5889238/why-is-xor-the-default-way-to-combine-hashes + h1 ^ (h2 + .wrapping_add(0x517cc1b727220a95) + .wrapping_add(h1 << 6) + .wrapping_add(h1 >> 2)) + } +} + +impl SeedableHash for ExternallyHashed { + #[inline(always)] + fn hash_with_seed(&self, seed: u64) -> u64 { + Self::hash_combine(self.0, Self::fast_seed_hash(seed)) + } +} + #[inline] -fn hash_with_seed32(iter: u64, v: &T) -> u32 { - fold(hash_with_seed(iter, v)) +fn fold(v: u64) -> u32 { + ((v & 0xFFFFFFFF) as u32) ^ ((v >> 32) as u32) } #[inline] @@ -75,15 +238,14 @@ fn fastmod(hash: u32, n: u32) -> u64 { } #[inline] -fn hashmod(iter: u64, v: &T, n: u64) -> u64 { +fn hashmod(iter: u64, v: &T, n: u64) -> u64 { // when n < 2^32, use the fast alternative to modulo described here: // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + let h = v.hash_with_seed(iter); if n < (1 << 32) { - let h = hash_with_seed32(iter, v); - fastmod(h, n as u32) as u64 + fastmod(fold(h), n as u32) as u64 } else { - let h = hash_with_seed(iter, v); - h % (n as u64) + h % n } } @@ -97,7 +259,52 @@ pub struct Mphf { const MAX_ITERS: u64 = 100; -impl<'a, T: 'a + Hash + Debug> Mphf { +impl Mphf { + fn 
compute_ranks(bvs: Vec) -> Box<[(BitVector, Box<[u64]>)]> { + let mut ranks = Vec::new(); + let mut pop = 0_u64; + + for bv in bvs { + let mut rank: Vec = Vec::new(); + for i in 0..bv.num_words() { + let v = bv.get_word(i); + + if i % 8 == 0 { + rank.push(pop) + } + + pop += v.count_ones() as u64; + } + + ranks.push((bv, rank.into_boxed_slice())) + } + + ranks.into_boxed_slice() + } + + #[inline] + fn get_rank(&self, hash: u64, i: usize) -> u64 { + let idx = hash as usize; + let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); + + // Last pre-computed rank + let mut rank = ranks[idx / 512]; + + // Add rank of intervening words + for j in (idx / 64) & !7..idx / 64 { + rank += bv.get_word(j).count_ones() as u64; + } + + // Add rank of final word up to hash + let final_word = bv.get_word(idx / 64); + if idx % 64 > 0 { + rank += (final_word << (64 - (idx % 64))).count_ones() as u64; + } + rank + } +} + +impl<'a, T: 'a + SeedableHash + Debug> Mphf { /// Constructs an MPHF from a (possibly lazy) iterator over iterators. /// This allows construction of very large MPHFs without holding all the keys /// in memory simultaneously. @@ -127,7 +334,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { loop { if iter > MAX_ITERS { error!("ran out of key space. items: {:?}", done_keys.len()); - panic!("counldn't find unique hashes"); + panic!("couldn't find unique hashes"); } let keys_remaining = if iter == 0 { @@ -199,7 +406,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { object_pos = object_index + 1; - let idx = hashmod(seed, &key, size); + let idx = hashmod(seed, &&key, size); if collide.contains(idx) { a.remove(idx); @@ -226,7 +433,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { } } -impl Mphf { +impl Mphf { /// Generate a minimal perfect hash function for the set of `objects`. /// `objects` must not contain any duplicate items. 
/// `gamma` controls the tradeoff between the construction-time and run-time speed, @@ -274,49 +481,6 @@ impl Mphf { } } - fn compute_ranks(bvs: Vec) -> Box<[(BitVector, Box<[u64]>)]> { - let mut ranks = Vec::new(); - let mut pop = 0_u64; - - for bv in bvs { - let mut rank: Vec = Vec::new(); - for i in 0..bv.num_words() { - let v = bv.get_word(i); - - if i % 8 == 0 { - rank.push(pop) - } - - pop += v.count_ones() as u64; - } - - ranks.push((bv, rank.into_boxed_slice())) - } - - ranks.into_boxed_slice() - } - - #[inline] - fn get_rank(&self, hash: u64, i: usize) -> u64 { - let idx = hash as usize; - let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); - - // Last pre-computed rank - let mut rank = ranks[idx / 512]; - - // Add rank of intervening words - for j in (idx / 64) & !7..idx / 64 { - rank += bv.get_word(j).count_ones() as u64; - } - - // Add rank of final word up to hash - let final_word = bv.get_word(idx / 64); - if idx % 64 > 0 { - rank += (final_word << (64 - (idx % 64))).count_ones() as u64; - } - rank - } - /// Compute the hash value of `item`. This method should only be used /// with items known to be in construction set. Use `try_hash` if you cannot /// guarantee that `item` was in the construction set. 
If `item` was not present @@ -324,7 +488,7 @@ impl Mphf { pub fn hash(&self, item: &T) -> u64 { for i in 0..self.bitvecs.len() { let (bv, _) = &self.bitvecs[i]; - let hash = hashmod(i as u64, item, bv.capacity() as u64); + let hash = hashmod(i as u64, item, bv.capacity()); if bv.contains(hash) { return self.get_rank(hash, i); @@ -340,11 +504,11 @@ impl Mphf { pub fn try_hash(&self, item: &Q) -> Option where T: Borrow, - Q: ?Sized + Hash, + Q: ?Sized + SeedableHash, { for i in 0..self.bitvecs.len() { let (bv, _) = &(self.bitvecs)[i]; - let hash = hashmod(i as u64, item, bv.capacity() as u64); + let hash = hashmod(i as u64, item, bv.capacity()); if bv.contains(hash) { return Some(self.get_rank(hash, i)); @@ -356,7 +520,7 @@ impl Mphf { } #[cfg(feature = "parallel")] -impl Mphf { +impl Mphf { /// Same as `new`, but parallelizes work on the rayon default Rayon threadpool. /// Configure the number of threads on that threadpool to control CPU usage. #[cfg(feature = "parallel")] @@ -418,7 +582,7 @@ struct Context { impl Context { fn new(size: u64, seed: u64) -> Self { Self { - size: size as u64, + size, seed, a: BitVector::new(size), collide: BitVector::new(size), @@ -426,14 +590,14 @@ impl Context { } #[cfg(feature = "parallel")] - fn find_collisions(&self, v: &T) { + fn find_collisions(&self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert(idx) { self.collide.insert(idx); } } - fn find_collisions_sync(&mut self, v: &T) { + fn find_collisions_sync(&mut self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert_sync(idx) { self.collide.insert_sync(idx); @@ -441,7 +605,7 @@ impl Context { } #[cfg(feature = "parallel")] - fn filter<'t, T: Hash>(&self, v: &'t T) -> Option<&'t T> { + fn filter<'t, T: SeedableHash>(&self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); @@ -452,7 +616,7 @@ impl 
Context { } #[cfg(not(feature = "parallel"))] - fn filter<'t, T: Hash>(&mut self, v: &'t T) -> Option<&'t T> { + fn filter<'t, T: SeedableHash>(&mut self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); @@ -533,7 +697,10 @@ where } #[cfg(feature = "parallel")] -impl<'a, T: 'a + Hash + Debug + Send + Sync> Mphf { +impl<'a, T: 'a + SeedableHash + Debug + Send + Sync> Mphf +where + &'a T: SeedableHash, +{ /// Same as to `from_chunked_iterator` but parallelizes work over `num_threads` threads. #[cfg(feature = "parallel")] pub fn from_chunked_iterator_parallel( @@ -569,7 +736,7 @@ impl<'a, T: 'a + Hash + Debug + Send + Sync> Mphf { loop { if max_iters.is_some() && iter > max_iters.unwrap() { error!("ran out of key space. items: {:?}", global.done_keys.len()); - panic!("counldn't find unique hashes"); + panic!("couldn't find unique hashes"); } let keys_remaining = if iter == 0 { @@ -701,7 +868,7 @@ mod tests { /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf(xs: HashSet) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let xsv: Vec = xs.into_iter().collect(); @@ -712,7 +879,7 @@ mod tests { /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf_serial(xsv: &[T]) -> bool where - T: Hash + PartialEq + Eq + Debug, + T: SeedableHash + PartialEq + Eq + Debug, { // Generate the MPHF let phf = Mphf::new(1.7, xsv); @@ -731,7 +898,7 @@ mod tests { #[cfg(feature = "parallel")] fn check_mphf_parallel(xsv: &[T]) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { // Generate the MPHF let phf = Mphf::new_parallel(1.7, xsv, None); @@ -749,14 +916,14 @@ mod tests { #[cfg(not(feature = "parallel"))] fn check_mphf_parallel(_xsv: &[T]) -> bool where - T: Hash + PartialEq + Eq + 
Debug, + T: SeedableHash + PartialEq + Eq + Debug, { true } fn check_chunked_mphf(values: Vec>, total: u64) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator(1.7, &values, total); @@ -776,7 +943,7 @@ mod tests { #[cfg(feature = "parallel")] fn check_chunked_mphf_parallel(values: Vec>, total: u64) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator_parallel(1.7, &values, None, total, 2); @@ -883,4 +1050,27 @@ mod tests { let items = (0..1000000).map(|x| x * 2); assert!(check_mphf(HashSet::from_iter(items))); } + + #[test] + fn externally_hashed() { + let total = 1000000; + // User gets to pick the hash function. + let entries = (0..total) + .map(|x| ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect::>(); + let phf = Mphf::new(1.7, &entries); + + let mut hashes = entries.iter().map(|eh| phf.hash(eh)).collect::>(); + hashes.sort_unstable(); + + let gt = (0..total as u64).collect::>(); + assert_eq!(hashes, gt); + + // Hand-picked a value that fails to hash since it's not in the original set that it's built from. + // It's not ideal that this assertion is sensitive to the implementation details internal to Mphf. 
+ assert_eq!( + phf.try_hash(&ExternallyHashed(wyhash::wyrng(&mut 1000129))), + None + ); + } } diff --git a/src/par_iter.rs b/src/par_iter.rs index 0ced54d..a00c530 100644 --- a/src/par_iter.rs +++ b/src/par_iter.rs @@ -1,12 +1,11 @@ -use std::hash::Hash; - use crate::hashmap::BoomHashMap; +use crate::SeedableHash; use rayon::iter::plumbing::{bridge, Consumer, Producer, ProducerCallback, UnindexedConsumer}; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; impl<'data, K, V> IntoParallelIterator for &'data BoomHashMap where - K: Hash + Sync + 'data, + K: SeedableHash + Sync + 'data, V: Sync + 'data, { type Item = (&'data K, &'data V);