Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/main/rust/rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"
126 changes: 124 additions & 2 deletions src/main/rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
//! With this implementation it is necessary to parse the field metadata once on each side of the
//! FFI boundary since this implementation does not interact with IndexInput.

#![feature(stdarch_aarch64_prefetch)]

pub mod direct_monotonic_reader;
pub mod vint;

use std::arch::aarch64::{vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
use std::collections::BinaryHeap;
use std::iter::FusedIterator;

Expand Down Expand Up @@ -331,6 +334,48 @@ impl FieldVectorData {
score: self.similarity.score(query, self.get(ord as usize)),
}
}

#[inline(always)]
pub fn score_many_ords(&self, query: &[f32], mut ords: &[u32], mut scores: &mut [f32]) {
let mut ord_chunks = ords.chunks_exact(8);
let mut score_chunks = scores.chunks_exact_mut(8);
for (ord_chunk, score_chunk) in ord_chunks.by_ref().zip(score_chunks.by_ref()) {
let docs = [
self.get(ord_chunk[0] as usize),
self.get(ord_chunk[1] as usize),
self.get(ord_chunk[2] as usize),
self.get(ord_chunk[3] as usize),
self.get(ord_chunk[4] as usize),
self.get(ord_chunk[5] as usize),
self.get(ord_chunk[6] as usize),
self.get(ord_chunk[7] as usize),
];
score_chunk.copy_from_slice(&self.similarity.score_many::<8, 128>(query, docs));
}
ords = ord_chunks.remainder();
scores = score_chunks.into_remainder();
if ords.len() >= 4 {
let docs = [
self.get(ords[0] as usize),
self.get(ords[1] as usize),
self.get(ords[2] as usize),
self.get(ords[3] as usize),
];
scores[..4].copy_from_slice(&self.similarity.score_many::<4, 128>(query, docs));
ords = &ords[4..];
scores = &mut scores[4..];
if ords.len() >= 2 {
let docs = [self.get(ords[0] as usize), self.get(ords[1] as usize)];
scores[..2].copy_from_slice(&self.similarity.score_many::<2, 128>(query, docs));
ords = &ords[2..];
scores = &mut scores[2..];

if ords.len() == 1 {
scores[0] = self.similarity.score(query, self.get(ords[0] as usize));
}
}
}
}
}

#[derive(Debug, PartialEq, Eq, Copy, Clone)]
Expand Down Expand Up @@ -384,12 +429,81 @@ impl VectorSimilarity {
Self::DotProduct => {
// I tried a manual aarch64 SIMD implementation where I unrolled the loop (16d)
// and it was not any faster, maybe actually slower.
0.0f32.max((1.0f32 + SpatialSimilarity::dot(q, d).unwrap() as f32) / 2.0f32)
// XXX 0.0f32.max((1.0f32 + SpatialSimilarity::dot(q, d).unwrap() as f32) / 2.0f32)
0.0f32.max((1.0f32 + Self::dot(q, d)) / 2.0f32)
}
Self::Cosine => unimplemented!(),
Self::MaximumInnerProduct => unimplemented!(),
}
}

pub fn score_many<const N: usize, const P: usize>(
&self,
q: &[f32],
docs: [&[f32]; N],
) -> [f32; N] {
match self {
Self::DotProduct => {
let mut scores = Self::dot_many::<N, P>(q, docs);
for n in 0..N {
scores[n] = 0.0f32.max(1.0f32 + scores[n]) / 2.0f32;
}
scores
}
_ => unimplemented!(),
}
}

fn dot(q: &[f32], d: &[f32]) -> f32 {
unsafe {
let mut dot = vdupq_n_f32(0.0);
for offset in (0..q.len()).step_by(4) {
if offset % 16 == 0 && offset + 128 < q.len() {
core::arch::aarch64::_prefetch::<0, 3>(
q.as_ptr().add(offset + 128) as *const i8
);
}
let qc = vld1q_f32(q.as_ptr().add(offset));
let dc = vld1q_f32(d.as_ptr().add(offset));
dot = vfmaq_f32(dot, qc, dc);
}
vaddvq_f32(dot)
}
}

fn dot_many<const N: usize, const P: usize>(q: &[f32], docs: [&[f32]; N]) -> [f32; N] {
unsafe {
for offset in (0..P).step_by(16) {
for n in 0..N {
core::arch::aarch64::_prefetch::<0, 3>(
docs[n].as_ptr().add(offset) as *const i8
);
}
}
let mut dot = [vdupq_n_f32(0.0); N];
for offset in (0..q.len()).step_by(4) {
let prefetch_offset = P + offset;
if prefetch_offset % 16 == 0 && prefetch_offset < q.len() {
for n in 0..N {
core::arch::aarch64::_prefetch::<0, 3>(
docs[n].as_ptr().add(prefetch_offset) as *const i8,
);
}
}

let qv = vld1q_f32(q.as_ptr().add(offset));
for n in 0..N {
dot[n] = vfmaq_f32(dot[n], qv, vld1q_f32(docs[n].as_ptr().add(offset)));
}
}

let mut scores = [0.0f32; N];
for n in 0..N {
scores[n] = vaddvq_f32(dot[n]);
}
scores
}
}
}

impl TryFrom<i32> for VectorSimilarity {
Expand Down Expand Up @@ -618,6 +732,8 @@ fn search_index(
{
queue.push(best_entry_point);
}
let mut ords = Vec::with_capacity(index.max_edges * 2);
let mut scores = Vec::with_capacity(index.max_edges * 2);
while let Some(candidate) = candidates.pop_min() {
// If the best candidate is worse than the worst result, break.
if candidate.score < queue.min_similarity() {
Expand All @@ -629,7 +745,13 @@ fn search_index(
continue;
}

let n = vector_data.score_ord(query, vertex);
ords.push(vertex);
}

scores.resize(ords.len(), 0.0);
vector_data.score_many_ords(query, &ords, &mut scores);
for (vertex, score) in ords.drain(..).zip(scores.drain(..)) {
let n = Neighbor { vertex, score };
// This differs from lucene in that we limit the length of the queue.
if candidates.len() < queue.len() {
candidates.push(n);
Expand Down