Skip to content

Commit e872158

Browse files
committed
Improve alignment
Signed-off-by: Heinz N. Gies <[email protected]>
1 parent 3029ff5 commit e872158

File tree

8 files changed

+80
-97
lines changed

8 files changed

+80
-97
lines changed

src/impls/avx2/stage1.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,13 @@ impl Stage1Parse for SimdInput {
4141
type Utf8Validator = simdutf8::basic::imp::x86::avx2::ChunkedUtf8ValidatorImp;
4242
type SimdRepresentation = __m256i;
4343
#[cfg_attr(not(feature = "no-inline"), inline)]
44-
// _mm256_loadu_si256 does not need alignment
44+
// _mm256_loadu_si256 does not need alignment we allign our input so we can use _mm256_loadu_si256
4545
#[allow(clippy::cast_ptr_alignment)]
4646
#[target_feature(enable = "avx2")]
47-
unsafe fn new(ptr: &[u8]) -> Self {
47+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
4848
Self {
49-
v0: _mm256_loadu_si256(ptr.as_ptr().cast::<__m256i>()),
50-
v1: _mm256_loadu_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
49+
v0: _mm256_load_si256(ptr.as_ptr().cast::<__m256i>()),
50+
v1: _mm256_load_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
5151
}
5252
}
5353

src/impls/native/stage1.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -296,12 +296,12 @@ pub(crate) struct SimdInput {
296296
impl Stage1Parse for SimdInput {
297297
type Utf8Validator = super::ChunkedUtf8ValidatorImp;
298298
type SimdRepresentation = V128;
299-
unsafe fn new(ptr: &[u8]) -> Self {
299+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
300300
SimdInput {
301-
v0: *(ptr.as_ptr().cast::<V128>()),
302-
v1: *(ptr.as_ptr().add(16).cast::<V128>()),
303-
v2: *(ptr.as_ptr().add(32).cast::<V128>()),
304-
v3: *(ptr.as_ptr().add(48).cast::<V128>()),
301+
v0: ptr.as_ptr().cast::<V128>().read(),
302+
v1: ptr.as_ptr().add(16).cast::<V128>().read(),
303+
v2: ptr.as_ptr().add(32).cast::<V128>().read(),
304+
v3: ptr.as_ptr().add(48).cast::<V128>().read(),
305305
}
306306
}
307307

src/impls/neon/stage1.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::{static_cast_i32, Stage1Parse};
1+
use crate::{static_cast_i32, Stage1Parse, SIMDINPUT_LENGTH};
22
use std::arch::aarch64::{
33
int32x4_t, int8x16_t, uint8x16_t, vaddq_s32, vandq_u8, vceqq_u8, vcleq_u8, vdupq_n_s8,
44
vgetq_lane_u64, vld1q_u8, vmovq_n_u8, vpaddq_u8, vqtbl1q_u8, vreinterpretq_u64_u8,
@@ -53,12 +53,12 @@ impl Stage1Parse for SimdInput {
5353
type Utf8Validator = simdutf8::basic::imp::aarch64::neon::ChunkedUtf8ValidatorImp;
5454
type SimdRepresentation = int8x16_t;
5555
#[cfg_attr(not(feature = "no-inline"), inline)]
56-
unsafe fn new(ptr: &[u8]) -> Self {
56+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
5757
Self {
58-
v0: vld1q_u8(ptr.as_ptr().cast::<u8>()),
59-
v1: vld1q_u8(ptr.as_ptr().add(16).cast::<u8>()),
60-
v2: vld1q_u8(ptr.as_ptr().add(32).cast::<u8>()),
61-
v3: vld1q_u8(ptr.as_ptr().add(48).cast::<u8>()),
58+
v0: vld1q_u8(ptr.as_ptr()),
59+
v1: vld1q_u8(ptr.as_ptr().add(16)),
60+
v2: vld1q_u8(ptr.as_ptr().add(32)),
61+
v3: vld1q_u8(ptr.as_ptr().add(48)),
6262
}
6363
}
6464

src/impls/portable/stage1.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ impl Stage1Parse for SimdInput {
1010
type Utf8Validator = simdutf8::basic::imp::portable::ChunkedUtf8ValidatorImp;
1111
type SimdRepresentation = u8x64;
1212
#[cfg_attr(not(feature = "no-inline"), inline)]
13-
unsafe fn new(ptr: &[u8]) -> Self {
13+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
1414
Self {
15-
v: u8x64::from_array(*ptr.as_ptr().cast::<[u8; 64]>()),
15+
v: u8x64::from_array(ptr),
1616
}
1717
}
1818

src/impls/simd128/stage1.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ impl Stage1Parse for SimdInput {
1818

1919
#[cfg_attr(not(feature = "no-inline"), inline)]
2020
#[allow(clippy::cast_ptr_alignment)]
21-
unsafe fn new(ptr: &[u8]) -> Self {
21+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
2222
Self {
2323
v0: v128_load(ptr.as_ptr().cast::<v128>()),
2424
v1: v128_load(ptr.as_ptr().add(16).cast::<v128>()),

src/impls/sse42/stage1.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,12 @@ impl Stage1Parse for SimdInput {
4545
#[target_feature(enable = "sse4.2")]
4646
#[cfg_attr(not(feature = "no-inline"), inline)]
4747
#[allow(clippy::cast_ptr_alignment)]
48-
unsafe fn new(ptr: &[u8]) -> Self {
48+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
4949
Self {
50-
v0: _mm_loadu_si128(ptr.as_ptr().cast::<arch::__m128i>()),
51-
v1: _mm_loadu_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
52-
v2: _mm_loadu_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
53-
v3: _mm_loadu_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
50+
v0: _mm_load_si128(ptr.as_ptr().cast::<arch::__m128i>()),
51+
v1: _mm_load_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
52+
v2: _mm_load_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
53+
v3: _mm_load_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
5454
}
5555
}
5656

src/lib.rs

Lines changed: 35 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ pub(crate) trait Stage1Parse {
242242
type Utf8Validator: ChunkedUtf8Validator;
243243
type SimdRepresentation;
244244

245-
unsafe fn new(ptr: &[u8]) -> Self;
245+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self;
246246

247247
unsafe fn compute_quote_mask(quote_bits: u64) -> u64;
248248

@@ -799,10 +799,11 @@ impl<'de> Deserializer<'de> {
799799
#[cfg(all(target_arch = "aarch64", not(feature = "portable")))]
800800
#[cfg_attr(not(feature = "no-inline"), inline)]
801801
pub(crate) unsafe fn find_structural_bits(
802-
input: &[u8],
802+
input: &AlignedBuf,
803+
len: usize,
803804
structural_indexes: &mut Vec<u32>,
804805
) -> std::result::Result<(), ErrorType> {
805-
Self::_find_structural_bits::<impls::neon::SimdInput>(input, structural_indexes)
806+
Self::_find_structural_bits::<impls::neon::SimdInput>(input, len, structural_indexes)
806807
}
807808

808809
#[cfg(all(target_feature = "simd128", not(feature = "portable")))]
@@ -859,7 +860,7 @@ impl<'de> Deserializer<'de> {
859860
buffer: &mut Buffers,
860861
tape: &mut Vec<Node<'de>>,
861862
) -> Result<()> {
862-
const LOTS_OF_ZOERS: [u8; SIMDINPUT_LENGTH] = [0; SIMDINPUT_LENGTH];
863+
const LOTS_OF_ZOERS: [u8; SIMDINPUT_LENGTH] = [0x20; SIMDINPUT_LENGTH];
863864
let len = input.len();
864865
let simd_safe_len = len + SIMDINPUT_LENGTH;
865866

@@ -894,7 +895,7 @@ impl<'de> Deserializer<'de> {
894895
// safety: all bytes are initialized
895896
input_buffer.set_len(simd_safe_len);
896897

897-
Self::find_structural_bits(input, &mut buffer.structural_indexes)
898+
Self::find_structural_bits(input_buffer, input.len(), &mut buffer.structural_indexes)
898899
.map_err(Error::generic)?;
899900
};
900901

@@ -945,10 +946,11 @@ impl<'de> Deserializer<'de> {
945946
#[cfg_attr(not(feature = "no-inline"), inline)]
946947
#[allow(clippy::cast_possible_truncation)]
947948
pub(crate) unsafe fn _find_structural_bits<S: Stage1Parse>(
948-
input: &[u8],
949+
input: &AlignedBuf,
950+
len: usize,
949951
structural_indexes: &mut Vec<u32>,
950952
) -> std::result::Result<(), ErrorType> {
951-
let len = input.len();
953+
// let len = input.len();
952954
// 8 is a heuristic number to estimate it turns out a rate of 1/8 structural characters
953955
// leads almost never to relocations.
954956
structural_indexes.clear();
@@ -980,18 +982,18 @@ impl<'de> Deserializer<'de> {
980982
// expensive carryless multiply in the previous step with this work
981983
let mut structurals: u64 = 0;
982984

983-
let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
985+
// let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
984986
let mut idx: usize = 0;
985987
let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20)
986988

987-
while idx < lenminus64 {
989+
while idx <= len / SIMDINPUT_LENGTH {
988990
/*
989991
#ifndef _MSC_VER
990992
__builtin_prefetch(buf + idx + 128);
991993
#endif
992994
*/
993-
let chunk = input.get_kinda_unchecked(idx..idx + 64);
994-
utf8_validator.update_from_chunks(chunk);
995+
let chunk: [u8; SIMDINPUT_LENGTH] = input.load_register(idx);
996+
utf8_validator.update_from_chunks(&chunk);
995997

996998
let input = S::new(chunk);
997999
// detect odd sequences of backslashes
@@ -1010,7 +1012,7 @@ impl<'de> Deserializer<'de> {
10101012

10111013
// take the previous iterations structural bits, not our current iteration,
10121014
// and flatten
1013-
S::flatten_bits(structural_indexes, idx as u32, structurals);
1015+
S::flatten_bits(structural_indexes, (idx * 64) as u32, structurals);
10141016

10151017
let mut whitespace: u64 = 0;
10161018
input.find_whitespace_and_structurals(&mut whitespace, &mut structurals);
@@ -1023,58 +1025,15 @@ impl<'de> Deserializer<'de> {
10231025
quote_bits,
10241026
&mut prev_iter_ends_pseudo_pred,
10251027
);
1026-
idx += SIMDINPUT_LENGTH;
1028+
idx += 1;
10271029
}
10281030

1029-
// we use a giant copy-paste which is ugly.
1030-
// but otherwise the string needs to be properly padded or else we
1031-
// risk invalidating the UTF-8 checks.
1032-
if idx < len {
1033-
let mut tmpbuf: [u8; SIMDINPUT_LENGTH] = [0x20; SIMDINPUT_LENGTH];
1034-
tmpbuf
1035-
.as_mut_ptr()
1036-
.copy_from(input.as_ptr().add(idx), len - idx);
1037-
utf8_validator.update_from_chunks(&tmpbuf);
1038-
1039-
let input = S::new(&tmpbuf);
1040-
1041-
// detect odd sequences of backslashes
1042-
let odd_ends: u64 =
1043-
input.find_odd_backslash_sequences(&mut prev_iter_ends_odd_backslash);
1044-
1045-
// detect insides of quote pairs ("quote_mask") and also our quote_bits
1046-
// themselves
1047-
let mut quote_bits: u64 = 0;
1048-
let quote_mask: u64 = input.find_quote_mask_and_bits(
1049-
odd_ends,
1050-
&mut prev_iter_inside_quote,
1051-
&mut quote_bits,
1052-
&mut error_mask,
1053-
);
1054-
1055-
// take the previous iterations structural bits, not our current iteration,
1056-
// and flatten
1057-
S::flatten_bits(structural_indexes, idx as u32, structurals);
1058-
1059-
let mut whitespace: u64 = 0;
1060-
input.find_whitespace_and_structurals(&mut whitespace, &mut structurals);
1061-
1062-
// fixup structurals to reflect quotes and add pseudo-structural characters
1063-
structurals = S::finalize_structurals(
1064-
structurals,
1065-
whitespace,
1066-
quote_mask,
1067-
quote_bits,
1068-
&mut prev_iter_ends_pseudo_pred,
1069-
);
1070-
idx += SIMDINPUT_LENGTH;
1071-
}
10721031
// This test isn't in upstream, for some reason the error mask is et for then.
10731032
if prev_iter_inside_quote != 0 {
10741033
return Err(ErrorType::Syntax);
10751034
}
10761035
// finally, flatten out the remaining structurals from the last iteration
1077-
S::flatten_bits(structural_indexes, idx as u32, structurals);
1036+
S::flatten_bits(structural_indexes, (idx * 64) as u32, structurals);
10781037

10791038
// a valid JSON file cannot have zero structural indexes - we should have
10801039
// found something (note that we compare to 1 as we always add the root!)
@@ -1113,13 +1072,21 @@ impl AlignedBuf {
11131072
/// Creates a new buffer that is aligned with the simd register size
11141073
#[must_use]
11151074
pub fn with_capacity(capacity: usize) -> Self {
1116-
let layout = match Layout::from_size_align(capacity, SIMDJSON_PADDING) {
1117-
Ok(layout) => layout,
1118-
Err(_) => Self::capacity_overflow(),
1075+
let offset = capacity % SIMDINPUT_LENGTH;
1076+
let capacity = if offset == 0 {
1077+
capacity
1078+
} else {
1079+
capacity + SIMDINPUT_LENGTH - offset
11191080
};
1081+
11201082
if mem::size_of::<usize>() < 8 && capacity > isize::MAX as usize {
11211083
Self::capacity_overflow()
11221084
}
1085+
let layout = match Layout::from_size_align(capacity, SIMDINPUT_LENGTH) {
1086+
Ok(layout) => layout,
1087+
Err(_) => Self::capacity_overflow(),
1088+
};
1089+
11231090
let inner = match unsafe { NonNull::new(alloc(layout)) } {
11241091
Some(ptr) => ptr,
11251092
None => handle_alloc_error(layout),
@@ -1132,6 +1099,14 @@ impl AlignedBuf {
11321099
}
11331100
}
11341101

1102+
unsafe fn load_register(&self, idx: usize) -> [u8; SIMDINPUT_LENGTH] {
1103+
self.inner
1104+
.as_ptr()
1105+
.cast::<[u8; SIMDINPUT_LENGTH]>()
1106+
.add(idx)
1107+
.read()
1108+
}
1109+
11351110
fn as_mut_ptr(&mut self) -> *mut u8 {
11361111
self.inner.as_ptr()
11371112
}

src/tests/impls.rs

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,31 @@
1-
use crate::{impls, Deserializer, Stage1Parse, SIMDJSON_PADDING};
1+
use crate::{impls, AlignedBuf, Deserializer, Stage1Parse, SIMDINPUT_LENGTH};
22

33
fn test_find_structural_bits<S: Stage1Parse>(input_str: &str, expected: &[u32]) {
4-
let mut input = input_str.as_bytes().to_vec();
5-
input.append(&mut vec![0; SIMDJSON_PADDING]);
6-
let mut res = Vec::new();
7-
84
unsafe {
9-
Deserializer::_find_structural_bits::<S>(input.as_slice(), &mut res)
5+
let mut input = AlignedBuf::with_capacity(input_str.len() + SIMDINPUT_LENGTH);
6+
input
7+
.as_mut_ptr()
8+
.copy_from_nonoverlapping(input_str.as_bytes().as_ptr(), input_str.len());
9+
input
10+
.as_mut_ptr()
11+
.add(input_str.len())
12+
.write_bytes(0x20, SIMDINPUT_LENGTH);
13+
input.set_len(input_str.len() + SIMDINPUT_LENGTH);
14+
let mut res = Vec::new();
15+
16+
Deserializer::_find_structural_bits::<S>(&input, input_str.len(), &mut res)
1017
.expect("failed to find structural bits");
11-
};
12-
println!("{input_str}");
13-
assert_eq!(res, expected);
18+
19+
println!("{input_str}");
20+
assert_eq!(res, expected);
21+
}
1422
}
1523

1624
fn find_structural_bits_test_cases<S: Stage1Parse>() {
17-
test_find_structural_bits::<S>("", &[0]);
25+
// test_find_structural_bits::<S>("", &[0]);
1826
test_find_structural_bits::<S>("1", &[0]);
19-
test_find_structural_bits::<S>("[1]", &[0, 1, 2, 3]);
20-
test_find_structural_bits::<S>("[1, 2]", &[0, 1, 2, 4, 5, 6]);
27+
test_find_structural_bits::<S>("[1]", &[0, 1, 2]);
28+
test_find_structural_bits::<S>("[1, 2]", &[0, 1, 2, 4, 5]);
2129
test_find_structural_bits::<S>(
2230
r#"{
2331
"snot": "badger",
@@ -28,13 +36,13 @@ fn find_structural_bits_test_cases<S: Stage1Parse>() {
2836
&[
2937
0, 18, 24, 26, 34, 52, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
3038
78, 79, 80, 81, 82, 84, 85, 87, 88, 90, 92, 94, 96, 97, 111, 113, 132, 133, 134, 152,
31-
176, 178, 192, 210, 248, 250, 357, 358,
39+
176, 178, 192, 210, 248, 250, 357,
3240
],
3341
);
3442

3543
test_find_structural_bits::<S>(
3644
r#" { "hell\"o": 1 , "b": [ 1, 2, 3 ] }"#,
37-
&[1, 3, 12, 14, 16, 18, 21, 23, 25, 26, 28, 29, 31, 33, 35, 36],
45+
&[1, 3, 12, 14, 16, 18, 21, 23, 25, 26, 28, 29, 31, 33, 35],
3846
);
3947
}
4048

0 commit comments

Comments
 (0)