Skip to content

Commit 210c770

Browse files
authored
feat(array): use fixed chunk array for vectors (#869)
requires: #866 This patch adds a new array type for vector arrays. The array stores primitive-type chunks of the same length. Trying to insert chunks of different sizes will panic. #864 --------- Signed-off-by: Alex Chi Z <iskyzh@gmail.com>
1 parent 9b3516f commit 210c770

File tree

4 files changed

+262
-3
lines changed

4 files changed

+262
-3
lines changed

src/array/chunked_array.rs

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
// Copyright 2025 RisingLight Project Authors. Licensed under Apache-2.0.
2+
3+
use std::borrow::Borrow;
4+
use std::marker::PhantomData;
5+
use std::mem;
6+
7+
use bitvec::vec::BitVec;
8+
use serde::{Deserialize, Serialize};
9+
10+
use super::{
11+
Array, ArrayBuilder, ArrayEstimateExt, ArrayFromDataExt, ArrayValidExt, PrimitiveValueType,
12+
ValueRef,
13+
};
14+
use crate::types::{VectorRef, F64};
15+
16+
// A collection of fixed-length values.
17+
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
18+
pub struct ChunkedArray<T: ValueRef<U> + ?Sized, U: PrimitiveValueType = u8> {
19+
/// The length of each element. If the array is empty, it could be 0 instead of the actual
20+
/// type's length.
21+
element_length: usize,
22+
valid: BitVec,
23+
data: Box<[U]>,
24+
_type: PhantomData<T>,
25+
}
26+
27+
/// A [`ChunkedArray`] whose items are [`VectorRef`]s stored as chunks of [`F64`].
pub type VectorArray = ChunkedArray<VectorRef, F64>;
28+
/// A [`ChunkedArrayBuilder`] whose items are [`VectorRef`]s stored as chunks of [`F64`].
pub type VectorArrayBuilder = ChunkedArrayBuilder<VectorRef, F64>;
29+
30+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> Clone for ChunkedArray<T, U> {
31+
fn clone(&self) -> Self {
32+
Self {
33+
element_length: self.element_length,
34+
valid: self.valid.clone(),
35+
data: self.data.clone(),
36+
_type: PhantomData,
37+
}
38+
}
39+
}
40+
41+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> Array for ChunkedArray<T, U> {
42+
type Item = T;
43+
type Builder = ChunkedArrayBuilder<T, U>;
44+
45+
fn is_null(&self, idx: usize) -> bool {
46+
!self.valid[idx]
47+
}
48+
49+
fn get_raw(&self, idx: usize) -> &T {
50+
let data_slice = &self.data[self.element_length * idx..self.element_length * (idx + 1)];
51+
T::from_primitives(data_slice)
52+
}
53+
54+
fn len(&self) -> usize {
55+
self.valid.len()
56+
}
57+
58+
fn filter(&self, p: &[bool]) -> Self {
59+
assert_eq!(p.len(), self.len());
60+
let mut builder = Self::Builder::with_capacity(self.len());
61+
for (i, &v) in p.iter().enumerate() {
62+
if v {
63+
builder.push(self.get(i));
64+
}
65+
}
66+
builder.finish()
67+
}
68+
}
69+
70+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ArrayValidExt for ChunkedArray<T, U> {
71+
fn get_valid_bitmap(&self) -> &BitVec {
72+
&self.valid
73+
}
74+
fn get_valid_bitmap_mut(&mut self) -> &mut BitVec {
75+
&mut self.valid
76+
}
77+
}
78+
79+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ArrayEstimateExt for ChunkedArray<T, U> {
80+
fn get_estimated_size(&self) -> usize {
81+
self.data.len() + self.valid.len() / 8
82+
}
83+
}
84+
85+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ArrayFromDataExt for ChunkedArray<T, U> {
86+
fn from_data(data_iter: impl Iterator<Item = impl Borrow<Self::Item>>, valid: BitVec) -> Self {
87+
let mut data = Vec::with_capacity(valid.len());
88+
let mut element_length = None;
89+
for raw in data_iter {
90+
data.extend_from_slice(raw.borrow().as_ref());
91+
element_length = Some(raw.borrow().as_ref().len());
92+
}
93+
Self {
94+
valid,
95+
data: data.into(),
96+
element_length: element_length.unwrap_or_default(),
97+
_type: PhantomData,
98+
}
99+
}
100+
}
101+
102+
/// A builder that uses `&T` to build an [`BytesArray`].
103+
pub struct ChunkedArrayBuilder<T: ValueRef<U> + ?Sized, U: PrimitiveValueType = u8> {
104+
element_length: Option<usize>,
105+
valid: BitVec,
106+
data: Vec<U>,
107+
_type: PhantomData<T>,
108+
}
109+
110+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ChunkedArrayBuilder<T, U> {
    /// Records `length` as the chunk length, or checks it against the one already recorded.
    ///
    /// # Panics
    ///
    /// Panics if a chunk length has been recorded before and it differs from `length`.
    fn update_element_length(&mut self, length: usize) {
        match self.element_length {
            Some(expected) => assert_eq!(expected, length),
            None => self.element_length = Some(length),
        }
    }
}
119+
120+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ArrayBuilder for ChunkedArrayBuilder<T, U> {
121+
type Array = ChunkedArray<T, U>;
122+
123+
fn extend_from_raw_data(&mut self, raws: &[<<Self::Array as Array>::Item as ToOwned>::Owned]) {
124+
for raw in raws {
125+
self.data.extend_from_slice(raw.borrow().as_ref());
126+
self.update_element_length(raw.borrow().as_ref().len());
127+
}
128+
}
129+
130+
fn extend_from_nulls(&mut self, _: usize) {
131+
panic!("null value in chunked array builder");
132+
}
133+
134+
fn replace_bitmap(&mut self, valid: BitVec) {
135+
let _ = mem::replace(&mut self.valid, valid);
136+
}
137+
138+
fn with_capacity(capacity: usize) -> Self {
139+
Self {
140+
element_length: None,
141+
valid: BitVec::with_capacity(capacity),
142+
data: Vec::with_capacity(capacity),
143+
_type: PhantomData,
144+
}
145+
}
146+
147+
fn reserve(&mut self, capacity: usize) {
148+
self.valid.reserve(capacity);
149+
// For variable-length values, we cannot know the exact size of the value.
150+
// Therefore, we reserve `capacity` here, but it may overflow during use.
151+
self.data.reserve(capacity);
152+
}
153+
154+
fn push(&mut self, value: Option<&T>) {
155+
self.valid.push(value.is_some());
156+
if let Some(x) = value {
157+
self.data.extend_from_slice(x.as_ref());
158+
self.update_element_length(x.as_ref().len());
159+
} else {
160+
panic!("null value in chunked array builder");
161+
}
162+
}
163+
164+
fn push_n(&mut self, n: usize, value: Option<&T>) {
165+
self.valid.resize(self.valid.len() + n, value.is_some());
166+
if let Some(value) = value {
167+
self.data.reserve(value.as_ref().len() * n);
168+
self.update_element_length(value.as_ref().len());
169+
// TODO: optimize: push the value only once
170+
for _ in 0..n {
171+
self.data.extend_from_slice(value.as_ref());
172+
}
173+
} else {
174+
panic!("null value in chunked array builder");
175+
}
176+
}
177+
178+
fn append(&mut self, other: &ChunkedArray<T, U>) {
179+
self.valid.extend_from_bitslice(&other.valid);
180+
self.data.extend_from_slice(&other.data);
181+
self.update_element_length(other.element_length);
182+
}
183+
184+
fn take(&mut self) -> ChunkedArray<T, U> {
185+
ChunkedArray {
186+
valid: mem::take(&mut self.valid),
187+
data: mem::take(&mut self.data).into(),
188+
element_length: self.element_length.unwrap_or_default(),
189+
_type: PhantomData,
190+
}
191+
}
192+
}
193+
194+
#[allow(dead_code)]
195+
struct ChunkedArrayWriter<'a, T: ValueRef<U> + ?Sized, U: PrimitiveValueType> {
196+
builder: &'a mut ChunkedArrayBuilder<T, U>,
197+
written_length: usize,
198+
}
199+
200+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> ChunkedArrayWriter<'_, T, U> {
201+
#[allow(dead_code)]
202+
fn write_chunk(&mut self, s: &[U]) {
203+
self.builder.data.extend_from_slice(s);
204+
self.written_length += s.len();
205+
}
206+
}
207+
208+
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> Drop for ChunkedArrayWriter<'_, T, U> {
209+
fn drop(&mut self) {
210+
self.builder.update_element_length(self.written_length);
211+
self.builder.valid.push(true);
212+
}
213+
}
214+
215+
// Enable `collect()` an array from iterator of `Option<&T>` or `Option<T::Owned>`.
216+
impl<O: AsRef<T>, T: ValueRef<U> + ?Sized, U: PrimitiveValueType> FromIterator<Option<O>>
217+
for ChunkedArray<T, U>
218+
{
219+
fn from_iter<I: IntoIterator<Item = Option<O>>>(iter: I) -> Self {
220+
let iter = iter.into_iter();
221+
let mut builder = <Self as Array>::Builder::with_capacity(iter.size_hint().0);
222+
for e in iter {
223+
if let Some(s) = e {
224+
builder.push(Some(s.as_ref()));
225+
} else {
226+
builder.push(None);
227+
}
228+
}
229+
builder.finish()
230+
}
231+
}
232+
233+
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushing 100 three-element vectors yields an array of 100 non-null elements.
    #[test]
    fn test_vector_array_builder() {
        let mut builder = VectorArrayBuilder::with_capacity(100);
        for i in 0..100 {
            if i % 2 == 0 {
                builder.push(Some(VectorRef::new(&[
                    F64::from(i),
                    F64::from(i * 2),
                    F64::from(i * 3),
                ])));
            } else {
                builder.push(Some(VectorRef::new(&[
                    F64::from(i * 4),
                    F64::from(i * 5),
                    F64::from(i * 6),
                ])));
            }
        }
        let array = builder.finish();
        assert_eq!(array.len(), 100);
        assert!((0..array.len()).all(|idx| !array.is_null(idx)));
    }

    /// A chunk whose length differs from the first one pushed must be rejected with a panic.
    #[test]
    #[should_panic]
    fn test_vector_array_builder_rejects_mixed_lengths() {
        let mut builder = VectorArrayBuilder::with_capacity(2);
        builder.push(Some(VectorRef::new(&[
            F64::from(1),
            F64::from(2),
            F64::from(3),
        ])));
        builder.push(Some(VectorRef::new(&[F64::from(1), F64::from(2)])));
    }
}

src/array/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ use crate::types::{
1515
F64,
1616
};
1717

18+
mod chunked_array;
1819
mod data_chunk;
1920
mod data_chunk_builder;
2021
pub mod ops;
2122
mod primitive_array;
2223
mod var_array;
2324

25+
pub use self::chunked_array::*;
2426
pub use self::data_chunk::*;
2527
pub use self::data_chunk_builder::*;
2628
pub use self::primitive_array::*;

src/array/var_array.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub struct VarArray<T: ValueRef<U> + ?Sized, U: PrimitiveValueType = u8> {
2020
_type: PhantomData<T>,
2121
}
2222

23-
pub trait PrimitiveValueType: Send + Sync + 'static + Copy + Clone {}
23+
pub trait PrimitiveValueType: Send + Sync + 'static + Copy + Clone + Default {}
2424

2525
impl PrimitiveValueType for u8 {}
2626
impl PrimitiveValueType for F64 {}
@@ -56,10 +56,8 @@ impl ValueRef<F64> for VectorRef {
5656

5757
pub type StringArray = VarArray<str>;
5858
pub type BlobArray = VarArray<BlobRef>;
59-
pub type VectorArray = VarArray<VectorRef, F64>;
6059
pub type StringArrayBuilder = BytesArrayBuilder<str>;
6160
pub type BlobArrayBuilder = BytesArrayBuilder<BlobRef>;
62-
pub type VectorArrayBuilder = BytesArrayBuilder<VectorRef, F64>;
6361

6462
impl<T: ValueRef<U> + ?Sized, U: PrimitiveValueType> Clone for VarArray<T, U> {
6563
fn clone(&self) -> Self {

tests/sql/vector.slt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
statement ok
33
create table t (a vector(3) not null);
44

5+
statement ok
6+
select * from t;
7+
58
statement ok
69
insert into t values ('[-1, -2.0, -3]'), ('[1, 2.0, 3]');
710

0 commit comments

Comments
 (0)