Optimize partition_validity function used in sort kernels (apache#7937)

jhorstmann · web-flow · commit 233dad39b65b · 2025-07-18T07:55:56.000-04:00
# Which issue does this PR close? Optimize `partition_validity` function used in sort kernels - Preallocate vectors based on known null counts - Avoid dynamic dispatch by calling `NullBuffer::is_valid` instead of `Array::is_valid` - Avoid capacity checks inside loop by writing to `spare_capacity_mut` instead of using `push` - Closes apache#7936. # Rationale for this change Microbenchmark results for `sort_kernels` compared to `main`, only looking at benchmarks matching "nulls to indices": ``` sort i32 nulls to indices 2^10 time: [4.9325 µs 4.9370 µs 4.9422 µs] change: [−20.303% −20.133% −19.974%] (p = 0.00 < 0.05) Performance has improved. sort i32 nulls to indices 2^12 time: [20.096 µs 20.209 µs 20.327 µs] change: [−26.819% −26.275% −25.697%] (p = 0.00 < 0.05) Performance has improved. sort f32 nulls to indices 2^12 time: [26.329 µs 26.366 µs 26.406 µs] change: [−29.487% −29.331% −29.146%] (p = 0.00 < 0.05) Performance has improved. sort string[0-10] nulls to indices 2^12 time: [70.667 µs 70.762 µs 70.886 µs] change: [−20.057% −19.935% −19.819%] (p = 0.00 < 0.05) Performance has improved. sort string[0-100] nulls to indices 2^12 time: [101.98 µs 102.44 µs 102.99 µs] change: [−0.3501% +0.0835% +0.4918%] (p = 0.71 > 0.05) No change in performance detected. sort string[0-400] nulls to indices 2^12 time: [84.952 µs 85.024 µs 85.102 µs] change: [−5.3969% −4.9827% −4.6421%] (p = 0.00 < 0.05) Performance has improved. sort string[10] nulls to indices 2^12 time: [72.486 µs 72.664 µs 72.893 µs] change: [−14.937% −14.781% −14.599%] (p = 0.00 < 0.05) Performance has improved. sort string[100] nulls to indices 2^12 time: [71.354 µs 71.606 µs 71.902 µs] change: [−17.207% −16.795% −16.373%] (p = 0.00 < 0.05) Performance has improved. sort string[1000] nulls to indices 2^12 time: [73.088 µs 73.195 µs 73.311 µs] change: [−16.705% −16.599% −16.483%] (p = 0.00 < 0.05) Performance has improved. sort string_view[10] nulls to indices 2^12 time: [32.592 µs 32.654 µs 32.731 µs] change: [−15.722% −15.512% −15.310%] (p = 0.00 < 0.05) Performance has improved. sort string_view[0-400] nulls to indices 2^12 time: [32.981 µs 33.074 µs 33.189 µs] change: [−25.570% −25.132% −24.700%] (p = 0.00 < 0.05) Performance has improved. sort string_view_inlined[0-12] nulls to indices 2^12 time: [28.467 µs 28.496 µs 28.529 µs] change: [−22.978% −22.786% −22.574%] (p = 0.00 < 0.05) Performance has improved. sort string[10] dict nulls to indices 2^12 time: [94.463 µs 94.503 µs 94.542 µs] change: [−11.386% −11.165% −10.961%] (p = 0.00 < 0.05) Performance has improved. ``` # Are these changes tested? Covered by existing tests # Are there any user-facing changes? No, the method is internal to the sort kernels.
diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
@@ -180,13 +180,41 @@ where
 
 // partition indices into valid and null indices
 fn partition_validity(array: &dyn Array) -> (Vec<u32>, Vec<u32>) {
-    match array.null_count() {
-        // faster path
-        0 => ((0..(array.len() as u32)).collect(), vec![]),
-        _ => {
-            let indices = 0..(array.len() as u32);
-            indices.partition(|index| array.is_valid(*index as usize))
+    let len = array.len();
+    let null_count = array.null_count();
+    match array.nulls() {
+        Some(nulls) if null_count > 0 => {
+            let mut valid_indices = Vec::with_capacity(len - null_count);
+            let mut null_indices = Vec::with_capacity(null_count);
+
+            let valid_slice = valid_indices.spare_capacity_mut();
+            let null_slice = null_indices.spare_capacity_mut();
+            let mut valid_idx = 0;
+            let mut null_idx = 0;
+
+            nulls.into_iter().enumerate().for_each(|(i, v)| {
+                if v {
+                    valid_slice[valid_idx].write(i as u32);
+                    valid_idx += 1;
+                } else {
+                    null_slice[null_idx].write(i as u32);
+                    null_idx += 1;
+                }
+            });
+
+            assert_eq!(null_idx, null_count);
+            assert_eq!(valid_idx, len - null_count);
+            // Safety: The new lengths match the initial capacity as asserted above,
+            // the bounds checks while writing also ensure they less than or equal to the capacity.
+            unsafe {
+                valid_indices.set_len(valid_idx);
+                null_indices.set_len(null_idx);
+            }
+
+            (valid_indices, null_indices)
         }
+        // faster path
+        _ => ((0..(len as u32)).collect(), vec![]),
     }
 }