From 2ac5633ab1db560125da8cdd279d5ceaf6ae3ee0 Mon Sep 17 00:00:00 2001 From: flexatone Date: Tue, 22 Jul 2025 13:02:30 -0700 Subject: [PATCH] updated --- doc/articles/first_true_2d.py | 83 ++++++++--- src/lib.rs | 255 +++++++++++++++------------------- 2 files changed, 171 insertions(+), 167 deletions(-) diff --git a/doc/articles/first_true_2d.py b/doc/articles/first_true_2d.py index 70da0cc..5a49a7c 100644 --- a/doc/articles/first_true_2d.py +++ b/doc/articles/first_true_2d.py @@ -8,7 +8,9 @@ from itertools import repeat from arraykit import first_true_2d as ak_first_true_2d -from arrayredox import first_true_2d as ar_first_true_2d +from arrayredox import first_true_2d as ar_first_true_2d_a +from arrayredox import first_true_2d_b as ar_first_true_2d_b + import arraykit as ak import matplotlib.pyplot as plt @@ -58,33 +60,64 @@ def __call__(self): #------------------------------------------------------------------------------- -class ARFirstTrueAxis0Forward(ArrayProcessor): - NAME = 'ar.first_true_2d(forward=True, axis=0)' +class ARFirstTrueAAxis0Forward(ArrayProcessor): + NAME = 'ar.first_true_2d_a(forward=True, axis=0)' SORT = 10 def __call__(self): - _ = ar_first_true_2d(self.array, forward=True, axis=0) + _ = ar_first_true_2d_a(self.array, forward=True, axis=0) -class ARFirstTrueAxis1Forward(ArrayProcessor): - NAME = 'ar.first_true_2d(forward=True, axis=1)' +class ARFirstTrueAAxis1Forward(ArrayProcessor): + NAME = 'ar.first_true_2d_a(forward=True, axis=1)' SORT = 11 def __call__(self): - _ = ar_first_true_2d(self.array, forward=True, axis=1) + _ = ar_first_true_2d_a(self.array, forward=True, axis=1) -class ARFirstTrueAxis0Reverse(ArrayProcessor): - NAME = 'ar.first_true_2d(forward=False, axis=0)' +class ARFirstTrueAAxis0Reverse(ArrayProcessor): + NAME = 'ar.first_true_2d_a(forward=False, axis=0)' SORT = 12 def __call__(self): - _ = ar_first_true_2d(self.array, forward=False, axis=0) + _ = ar_first_true_2d_a(self.array, forward=False, axis=0) -class ARFirstTrueAxis1Reverse(ArrayProcessor): - NAME = 'ar.first_true_2d(forward=False, axis=1)' +class ARFirstTrueAAxis1Reverse(ArrayProcessor): + NAME = 'ar.first_true_2d_a(forward=False, axis=1)' SORT = 13 def __call__(self): - _ = ar_first_true_2d(self.array, forward=False, axis=1) + _ = ar_first_true_2d_a(self.array, forward=False, axis=1) + +#------------------------------------------------------------------------------- +class ARFirstTrueBAxis0Forward(ArrayProcessor): + NAME = 'ar.first_true_2d_b(forward=True, axis=0)' + SORT = 20 + + def __call__(self): + + _ = ar_first_true_2d_b(self.array, forward=True, axis=0) + +class ARFirstTrueBAxis1Forward(ArrayProcessor): + NAME = 'ar.first_true_2d_b(forward=True, axis=1)' + SORT = 21 + + def __call__(self): + _ = ar_first_true_2d_b(self.array, forward=True, axis=1) + +class ARFirstTrueBAxis0Reverse(ArrayProcessor): + NAME = 'ar.first_true_2d_b(forward=False, axis=0)' + SORT = 22 + + def __call__(self): + _ = ar_first_true_2d_b(self.array, forward=False, axis=0) + +class ARFirstTrueBAxis1Reverse(ArrayProcessor): + NAME = 'ar.first_true_2d_b(forward=False, axis=1)' + SORT = 23 + + def __call__(self): + _ = ar_first_true_2d_b(self.array, forward=False, axis=1) + #------------------------------------------------------------------------------- @@ -92,7 +125,7 @@ def __call__(self): class NPNonZero(ArrayProcessor): NAME = 'np.nonzero()' - SORT = 3 + SORT = 33 def __call__(self): x, y = np.nonzero(self.array) @@ -101,7 +134,7 @@ def __call__(self): class NPArgMaxAxis0(ArrayProcessor): NAME = 'np.any(axis=0), np.argmax(axis=0)' - SORT = 4 + SORT = 34 def __call__(self): _ = ~np.any(self.array, axis=0) @@ -109,7 +142,7 @@ def __call__(self): class NPArgMaxAxis1(ArrayProcessor): NAME = 'np.any(axis=1), np.argmax(axis=1)' - SORT = 4 + SORT = 35 def __call__(self): _ = ~np.any(self.array, axis=1) @@ -303,14 +336,20 @@ def get_versions() -> str: AKFirstTrueAxis0Reverse, AKFirstTrueAxis1Reverse, - ARFirstTrueAxis0Forward, - ARFirstTrueAxis1Forward, - ARFirstTrueAxis0Reverse, - ARFirstTrueAxis1Reverse, + ARFirstTrueAAxis0Forward, + ARFirstTrueAAxis1Forward, + ARFirstTrueAAxis0Reverse, + ARFirstTrueAAxis1Reverse, + + ARFirstTrueBAxis0Forward, + ARFirstTrueBAxis1Forward, + ARFirstTrueBAxis0Reverse, + ARFirstTrueBAxis1Reverse, + # NPNonZero, - # NPArgMaxAxis0, - # NPArgMaxAxis1 + NPArgMaxAxis0, + NPArgMaxAxis1 ) CLS_FF = ( diff --git a/src/lib.rs b/src/lib.rs index b6cce45..51ee685 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,8 +13,7 @@ use numpy::PyArrayMethods; use numpy::PyReadonlyArray2; use numpy::PyUntypedArrayMethods; -// use rayon::prelude::*; -// use rayon::ThreadPoolBuilder; +use rayon::prelude::*; use std::sync::Arc; #[pyfunction] @@ -350,49 +349,6 @@ fn first_true_1d(py: Python, array: PyReadonlyArray1, forward: bool) -> is // axis == 0: transpose, copy to C // axis == 1: copy to C -// fn prepare_array_for_axis<'py>( -// py: Python<'py>, -// array: PyReadonlyArray2<'py, bool>, -// axis: isize, -// ) -> PyResult>> { -// if axis != 0 && axis != 1 { -// return Err(PyValueError::new_err("axis must be 0 or 1")); -// } - -// let is_c = array.is_c_contiguous(); -// let is_f = array.is_fortran_contiguous(); -// let array_view = array.as_array(); - -// match (is_c, is_f, axis) { -// (true, _, 1) => { -// // Already C-contiguous, no copy needed -// Ok(array_view.to_pyarray(py).to_owned()) -// } -// (_, true, 0) => { -// // F-contiguous original -> transposed will be C-contiguous, no copy needed -// Ok(array_view.reversed_axes().to_pyarray(py).to_owned()) -// } -// (_, true, 1) => { -// // F-contiguous, need to copy to C-contiguous -// let contiguous = array_view.as_standard_layout(); -// Ok(contiguous.to_pyarray(py).to_owned()) -// } -// (_, _, 1) => { -// // Neither C nor F contiguous, need to copy -// let contiguous = array_view.as_standard_layout(); -// Ok(contiguous.to_pyarray(py).to_owned()) -// } - -// (true, _, 0) | (_, _, 0) => { -// // C-contiguous or neither -> transposed won't be C-contiguous, need copy -// let transposed = array_view.reversed_axes(); -// let contiguous = transposed.as_standard_layout(); -// Ok(contiguous.to_pyarray(py).to_owned()) -// } -// _ => unreachable!(), -// } -// } - pub struct PreparedBool2D<'py> { pub data: &'py [u8], // contiguous byte slice (bool as u8) pub nrows: usize, @@ -463,6 +419,10 @@ pub fn prepare_array_for_axis<'py>( }) } + + + + #[pyfunction] #[pyo3(signature = (array, *, forward=true, axis))] pub fn first_true_2d<'py>( @@ -553,111 +513,115 @@ pub fn first_true_2d<'py>( Ok(pyarray) } -// #[pyfunction] -// #[pyo3(signature = (array, *, forward=true, axis))] -// pub fn first_true_2d_b<'py>( -// py: Python<'py>, -// array: PyReadonlyArray2<'py, bool>, -// forward: bool, -// axis: isize, -// ) -> PyResult>> { -// let prepared = prepare_array_for_axis(array, axis)?; -// let data = prepared.data; -// let rows = prepared.nrows; -// let row_len = prepared.ncols; - -// let mut result = vec![-1isize; rows]; - -// // Dynamically select thread count -// let max_threads = if rows < 100 { -// 1 -// } else if rows < 1000 { -// 1 -// } else if rows < 10000 { -// 1 -// } else { -// 16 -// }; -// py.allow_threads(|| { -// let base_ptr = data.as_ptr() as usize; -// const LANES: usize = 32; -// let ones = u8x32::splat(1); +#[pyfunction] +#[pyo3(signature = (array, *, forward=true, axis))] +pub fn first_true_2d_b<'py>( + py: Python<'py>, + array: PyReadonlyArray2<'py, bool>, + forward: bool, + axis: isize, +) -> PyResult>> { + let prepared = prepare_array_for_axis(array, axis)?; + let data = prepared.data; + let rows = prepared.nrows; + let row_len = prepared.ncols; + + let mut result = vec![-1isize; rows]; -// let process_row = |row: usize| -> isize { -// let ptr = (base_ptr + row * row_len) as *const u8; -// let mut found = -1isize; + // Dynamically select thread count + let max_threads = if rows < 100 { + 2 + } else if rows < 1000 { + 4 + } else if rows < 10000 { + 8 + } else { + 16 + }; + + py.allow_threads(|| { + let base_ptr = data.as_ptr() as usize; + const LANES: usize = 32; + let ones = u8x32::splat(1); + + let process_row = |row: usize| -> isize { + let ptr = (base_ptr + row * row_len) as *const u8; + let mut found = -1isize; + + unsafe { + if forward { + let mut i = 0; + while i + LANES <= row_len { + let chunk = &*(ptr.add(i) as *const [u8; LANES]); + let vec = u8x32::from(*chunk); + if vec.cmp_eq(ones).any() { + break; + } + i += LANES; + } + while i < row_len { + if *ptr.add(i) != 0 { + found = i as isize; + break; + } + i += 1; + } + } else { + let mut i = row_len; + while i >= LANES { + i -= LANES; + let chunk = &*(ptr.add(i) as *const [u8; LANES]); + let vec = u8x32::from(*chunk); + if vec.cmp_eq(ones).any() { + for j in (i..i + LANES).rev() { + if *ptr.add(j) != 0 { + found = j as isize; + break; + } + } + break; + } + } + if i > 0 && i < LANES { + for j in (0..i).rev() { + if *ptr.add(j) != 0 { + found = j as isize; + break; + } + } + } + } + } + + found + }; + + if max_threads == 1 { + // Single-threaded path + for row in 0..rows { + result[row] = process_row(row); + } + } else { + // Multi-threaded path with Rayon + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(max_threads) + .build() + .unwrap(); + + pool.install(|| { + result.par_iter_mut().enumerate().for_each(|(row, out)| { + *out = process_row(row); + }); + }); + } + }); + + Ok(PyArray1::from_vec(py, result)) +} -// unsafe { -// if forward { -// let mut i = 0; -// while i + LANES <= row_len { -// let chunk = &*(ptr.add(i) as *const [u8; LANES]); -// let vec = u8x32::from(*chunk); -// if vec.cmp_eq(ones).any() { -// break; -// } -// i += LANES; -// } -// while i < row_len { -// if *ptr.add(i) != 0 { -// found = i as isize; -// break; -// } -// i += 1; -// } -// } else { -// let mut i = row_len; -// while i >= LANES { -// i -= LANES; -// let chunk = &*(ptr.add(i) as *const [u8; LANES]); -// let vec = u8x32::from(*chunk); -// if vec.cmp_eq(ones).any() { -// for j in (i..i + LANES).rev() { -// if *ptr.add(j) != 0 { -// found = j as isize; -// break; -// } -// } -// break; -// } -// } -// if i > 0 && i < LANES { -// for j in (0..i).rev() { -// if *ptr.add(j) != 0 { -// found = j as isize; -// break; -// } -// } -// } -// } -// } -// found -// }; -// if max_threads == 1 { -// // Single-threaded path -// for row in 0..rows { -// result[row] = process_row(row); -// } -// } else { -// // Multi-threaded path with Rayon -// let pool = rayon::ThreadPoolBuilder::new() -// .num_threads(max_threads) -// .build() -// .unwrap(); - -// pool.install(|| { -// result.par_iter_mut().enumerate().for_each(|(row, out)| { -// *out = process_row(row); -// }); -// }); -// } -// }); - -// Ok(PyArray1::from_vec(py, result)) -// } //------------------------------------------------------------------------------ @@ -671,5 +635,6 @@ fn arrayredox(m: &Bound<'_, PyModule>) -> PyResult<()> { // m.add_function(wrap_pyfunction!(first_true_1d_g, m)?)?; m.add_function(wrap_pyfunction!(first_true_1d, m)?)?; m.add_function(wrap_pyfunction!(first_true_2d, m)?)?; + m.add_function(wrap_pyfunction!(first_true_2d_b, m)?)?; Ok(()) }