diff --git a/.gitignore b/.gitignore
index e6a47305..384df630 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,7 @@ __pycache__
 build
 *.diff
 *.orig
+callgrind*
+*.dot
+*.out
+*.prof
diff --git a/debug.py b/debug.py
new file mode 100644
index 00000000..9387e2a1
--- /dev/null
+++ b/debug.py
@@ -0,0 +1,202 @@
+from arraykit import array_to_duplicated_hashable
+import numpy as np
+
+class PO:
+    def __init__(self, v) -> None:
+        self.v = v
+    def __repr__(self) -> str:
+        return f'PO<{self.v}>'
+
+
+def handle_value_one_boundary(i, value, is_dup, set_obj, dict_obj):
+    seen = set_obj
+    assert dict_obj is None
+
+    if value not in seen:
+        seen.add(value)
+    else:
+        is_dup[i] = True
+
+
+def handle_value_exclude_boundaries(i, value, is_dup, set_obj, dict_obj):
+    duplicates = set_obj
+    first_unique_locations = dict_obj
+
+    if value not in first_unique_locations:
+        first_unique_locations[value] = i
+    else:
+        is_dup[i] = True
+
+        # Second time seeing a duplicate
+        if value not in duplicates:
+            is_dup[first_unique_locations[value]] = True
+
+        # always update last
+        duplicates.add(value)
+
+
+def handle_value_include_boundaries(i, value, is_dup, set_obj, dict_obj):
+    seen = set_obj
+    last_duplicate_locations = dict_obj
+
+    if value not in seen:
+        seen.add(value)
+    else:
+        is_dup[i] = True
+
+        # always update last
+        last_duplicate_locations[value] = i
+
+
+def iterate_1d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj):
+    if reverse:
+        iterator = reversed(array)
+    else:
+        iterator = array
+
+    size = len(array)
+
+    for i, value in enumerate(iterator):
+        if reverse:
+            i = size - i - 1
+
+        process_value_func(i, value, is_dup, set_obj, dict_obj)
+
+
+def iterate_2d(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj):
+    size = array.shape[axis]
+
+    if axis == 0:
+        iterator = array
+    else:
+        iterator = array.T
+
+    if reverse:
+        iterator = reversed(iterator)
+
+    for i, value in enumerate(map(tuple, iterator)):
+        if reverse:
+            i = size - i - 1
+
+        process_value_func(i, value, is_dup, set_obj, dict_obj)
+
+
+def python_impl(
+        array: np.ndarray,
+        axis: int = 0,
+        exclude_first: bool = False,
+        exclude_last: bool = False,
+        ) -> np.ndarray:
+    '''
+    Algorithm for finding duplicates in unsortable arrays of hashables. This will always be an object array.
+
+    Note:
+        np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to set directly.
+    '''
+    size = array.shape[axis]
+
+    reverse = not exclude_first and exclude_last
+
+    if array.ndim == 1:
+        iterate_func = iterate_1d
+    else:
+        iterate_func = iterate_2d
+
+    is_dup = np.full(size, False)
+
+    set_obj = set()
+    if exclude_first ^ exclude_last:
+        dict_obj = None
+        process_value_func = handle_value_one_boundary
+
+    elif not exclude_first and not exclude_last:
+        dict_obj = dict()
+        process_value_func = handle_value_exclude_boundaries
+
+    else:
+        dict_obj = dict()
+        process_value_func = handle_value_include_boundaries
+
+    iterate_func(array, axis, reverse, is_dup, process_value_func, set_obj, dict_obj)
+
+    if exclude_first and exclude_last:
+        is_dup[list(dict_obj.values())] = False
+
+    return is_dup
+
+
+def dprint(*args, debug):
+    '''Debug print'''
+    if debug:
+        print(*args)
+
+
+def run_test(array, debug=True):
+    def _test(*args):
+        dprint(args[1:], debug=debug)
+
+        python_result = python_impl(*args)
+        dprint('python:', python_result, debug=debug)
+
+        c_result = array_to_duplicated_hashable(*args)
+        dprint('c     :', c_result, debug=debug)
+        assert (python_result == c_result).all()
+
+    _test(array, 0, True, False)  # one_boundary (forward)
+    _test(array, 0, False, False) # exclude_boundaries
+    _test(array, 0, False, True)  # one_boundary (reverse)
+    _test(array, 0, True, True)   # include_boundaries
+
+    if len(array.shape) == 2:
+        _test(array, 1, True, False)
+        _test(array, 1, False, False)
+        _test(array, 1, False, True)
+        _test(array, 1, True, True)
+
+
+def test_arr1d(debug=True):
+    arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object)
+
+    # Test with normally constructed array
+    run_test(arr, debug=debug)
+
+    arr2d = np.array([[2, 1, 2],
+                      [3, 2, 3],
+                      [3, 2, 3],
+                      [2, 1, 2],
+                      [4, 3, 4],
+                      [3, 2, 3],
+                      [6, 6, 6]], dtype=object)
+
+    # Test with array slices
+    run_test(arr2d[:, 1], debug=debug)
+    run_test(arr2d.T[1], debug=debug)
+
+
+def test_arr2d(debug=True):
+    arr2d = np.array([
+        [1, 2, 2, 1, 3, 2, 6],
+        [2, 3, 3, 2, 4, 3, 6],
+        [2, 3, 3, 2, 4, 3, 6],
+        [1, 2, 2, 1, 3, 2, 6],
+        [3, 4, 4, 3, 5, 4, 6],
+        [2, 3, 3, 2, 4, 3, 6],
+        ], dtype=object)
+
+    run_test(arr2d, debug=debug)
+    run_test(arr2d.T, debug=debug)
+
+
+def test_misc(debug=True):
+    arr = np.array([1, PO(1), 2, 3, 1, PO(1), 2, 3, 2, -1, -233, 'aslkj', 'df', 'df', True, True, None, 1])
+    run_test(arr, debug=debug)
+
+    arr = np.arange(20).reshape(4, 5).astype(object)
+    run_test(arr, debug=debug)
+    run_test(arr.T, debug=debug)
+
+
+test_arr1d(debug=False)
+test_arr2d(debug=False)
+test_misc(debug=False)
+print('Done')
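
For reference, here is what the four parameter combinations produce for the 1-D array used in test_arr1d above (masks derived by hand from the reference algorithm; True marks a position reported as duplicated):

    import numpy as np
    from arraykit import array_to_duplicated_hashable

    arr = np.array([1, 2, 2, 1, 3, 2, 6], dtype=object)

    # first and last occurrences of duplicated values also marked as duplicated
    array_to_duplicated_hashable(arr, 0, False, False).tolist()
    # [True, True, True, True, False, True, False]

    # first occurrence of each duplicated value treated as unique
    array_to_duplicated_hashable(arr, 0, True, False).tolist()
    # [False, False, True, True, False, True, False]

    # last occurrence of each duplicated value treated as unique
    array_to_duplicated_hashable(arr, 0, False, True).tolist()
    # [True, True, True, False, False, False, False]

    # both first and last occurrences treated as unique
    array_to_duplicated_hashable(arr, 0, True, True).tolist()
    # [False, False, True, False, False, False, False]
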
diff --git a/performance/__main__.py b/performance/__main__.py
index 9e31dc4d..aeb08935 100644
--- a/performance/__main__.py
+++ b/performance/__main__.py
@@ -1,7 +1,8 @@
+import argparse
 import collections
 import datetime
+import itertools
 import timeit
-import argparse
 
 import numpy as np
 
@@ -17,6 +18,7 @@
 from performance.reference.util import dtype_from_element as dtype_from_element_ref
 from performance.reference.util import array_deepcopy as array_deepcopy_ref
 from performance.reference.util import isna_element as isna_element_ref
+from performance.reference.util import array_to_duplicated_hashable as array_to_duplicated_hashable_ref
 
 from performance.reference.array_go import ArrayGO as ArrayGOREF
 
@@ -32,6 +34,7 @@
 from arraykit import dtype_from_element as dtype_from_element_ak
 from arraykit import array_deepcopy as array_deepcopy_ak
 from arraykit import isna_element as isna_element_ak
+from arraykit import array_to_duplicated_hashable as array_to_duplicated_hashable_ak
 
 from arraykit import ArrayGO as ArrayGOAK
 
@@ -359,6 +362,81 @@ class IsNaElementPerfREF(IsNaElementPerf):
     entry = staticmethod(isna_element_ref)
 
 
+#-------------------------------------------------------------------------------
+class ArrayToDuplicatedHashablePerf(Perf):
+    NUMBER = 3
+    FUNCTIONS = (
+        'array_1d_small',
+        'array_1d_large',
+        'array_2d_small',
+        'array_2d_large',
+    )
+
+    def __init__(self):
+        self.arrays_1d_small = [
+            np.array([0,0,1,0,None,None,0,1,None], dtype=object),
+            np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object),
+            np.array(['q','q','q', 'a', 'w', 'w'], dtype=object),
+            np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object),
+        ]
+
+        # 0.99920089 0.94194469
+        rs = np.random.RandomState(0)
+        self.arrays_1d_large = [
+            np.arange(100_000).astype(object),                 # All unique                                 0.73636183 0.73142613
+            np.full(100_000, fill_value='abc').astype(object), # All duplicated                             0.99341718 1.07130567
+            rs.randint(0, 100, 100_000).astype(object),        # Many repeated elements from small subset   0.96812477 0.97921523
+            rs.randint(0, 10_000, 100_000).astype(object),     # Many repeated elements from medium subset  1.05508269 0.9765244
+            rs.randint(0, 75_000, 100_000).astype(object),     # Some repeated elements from a large subset 0.81474696 0.89746359
+            np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), # Custom 0.84165586 0.86117453
+        ]
+
+        self.arrays_2d_small = [
+            np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
+            np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
+            np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object),
+        ]
+        self.arrays_2d_large = [
+            np.arange(100_000).reshape(10_000, 10).astype(object),
+            np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object),
+        ]
+
+    def array_1d_small(self):
+        for _ in range(10000):
+            for arr in self.arrays_1d_small:
+                self.entry(arr, 0, False, False)
+                self.entry(arr, 0, True, False)
+                self.entry(arr, 0, False, True)
+
+    def array_1d_large(self):
+        for _ in range(5):
+            for arr in self.arrays_1d_large:
+                self.entry(arr, 0, False, False)
+                self.entry(arr, 0, True, False)
+                self.entry(arr, 0, False, True)
+
+    def array_2d_small(self):
+        for _ in range(5000):
+            for axis, arr in itertools.product((0, 1), self.arrays_2d_small):
+                self.entry(arr, axis, False, False)
+                self.entry(arr, axis, True, False)
+                self.entry(arr, axis, False, True)
+
+    def array_2d_large(self):
+        for _ in range(12):
+            for axis, arr in itertools.product((0, 1), self.arrays_2d_large):
+                self.entry(arr, axis, False, False)
+                self.entry(arr, axis, True, False)
+                self.entry(arr, axis, False, True)
+
+
+class ArrayToDuplicatedHashablePerfAK(ArrayToDuplicatedHashablePerf):
+    entry = staticmethod(array_to_duplicated_hashable_ak)
+
+class ArrayToDuplicatedHashablePerfREF(ArrayToDuplicatedHashablePerf):
+    entry = staticmethod(array_to_duplicated_hashable_ref)
+
+
 #-------------------------------------------------------------------------------
 
 def get_arg_parser():
@@ -399,7 +477,7 @@ def main():
                     number=cls_runner.NUMBER)
             records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak']))
 
-    width = 24
+    width = 36
     for record in records:
         print(''.join(
             (r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record
diff --git a/performance/reference/util.py b/performance/reference/util.py
index 0f2d0efc..167de3ba 100644
--- a/performance/reference/util.py
+++ b/performance/reference/util.py
@@ -216,3 +216,63 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype:
     # NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls
     return np.array(value).dtype
+
+#-------------------------------------------------------------------------------
+# tools for handling duplicates
+
+def array_to_duplicated_hashable(
+        array: np.ndarray,
+        axis: int = 0,
+        exclude_first: bool = False,
+        exclude_last: bool = False,
+        ) -> np.ndarray:
+    '''
+    Algorithm for finding duplicates in unsortable arrays of hashables. This will always be an object array.
+    '''
+    # np.unique fails under the same conditions that sorting fails, so there is no need to try np.unique: must go to set directly.
+    len_axis = array.shape[axis]
+
+    if array.ndim == 1:
+        value_source = array
+        to_hashable = None
+    else:
+        if axis == 0:
+            value_source = array # will iterate rows
+        else:
+            value_source = (array[:, i] for i in range(len_axis))
+        # values will be arrays; must convert to tuples to make hashable
+        to_hashable = tuple
+
+    is_dupe = np.full(len_axis, False)
+
+    # could exit early with a set, but we would have to hash the entire array twice: once for the set and once for the dictionary
+    # creating a list for each entry and tracking indices would be very expensive
+
+    unique_to_first: tp.Dict[tp.Hashable, int] = {} # value to index of first occurrence
+    dupe_to_first: tp.Dict[tp.Hashable, int] = {}
+    dupe_to_last: tp.Dict[tp.Hashable, int] = {}
+
+    for idx, v in enumerate(value_source):
+
+        if to_hashable:
+            v = to_hashable(v)
+
+        if v not in unique_to_first:
+            unique_to_first[v] = idx
+        else:
+            # v has been seen before; update the Boolean array
+            is_dupe[idx] = True
+
+            # if v has no entry in dupe_to_first, record the index at which this value was first seen (stored in unique_to_first)
+            if v not in dupe_to_first:
+                dupe_to_first[v] = unique_to_first[v]
+            # always update last
+            dupe_to_last[v] = idx
+
+    if exclude_last: # overwrite with False
+        is_dupe[list(dupe_to_last.values())] = False
+
+    if not exclude_first: # add in first values
+        is_dupe[list(dupe_to_first.values())] = True
+
+    return is_dupe
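
A note on the axis parameter of the reference implementation above: for 2-D input, whole rows (axis=0) or whole columns (axis=1) are hashed as tuples, so the mask has one entry per row or column. Using the array from the unit tests below:

    import numpy as np
    from performance.reference.util import array_to_duplicated_hashable

    a = np.array([[50, 50, 32, 17, 17],
                  [ 2,  2,  1,  3,  3]], dtype=object)

    # compare rows: the two rows differ, so nothing is duplicated
    array_to_duplicated_hashable(a, axis=0).tolist()  # [False, False]

    # compare columns: (50, 2) and (17, 3) each appear twice
    array_to_duplicated_hashable(a, axis=1).tolist()  # [True, True, False, True, True]
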
diff --git a/profile.py b/profile.py
new file mode 100644
index 00000000..0c849fa9
--- /dev/null
+++ b/profile.py
@@ -0,0 +1,70 @@
+import numpy as np
+import sys
+from arraykit import array_to_duplicated_hashable
+
+
+def main(setup):
+    if setup == 'small_1d':
+        ITERATIONS = 5_000
+        axes = (0,)
+        arrays = [
+            np.array([0,0,1,0,None,None,0,1,None], dtype=object),
+            np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object),
+            np.array(['q','q','q', 'a', 'w', 'w'], dtype=object),
+            np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object),
+        ]
+
+    elif setup == 'large_1d':
+        ITERATIONS = 10
+        axes = (0,)
+
+        rs = np.random.RandomState(0)
+        arrays = [
+            np.arange(100_000).astype(object),                 # All unique
+            np.full(100_000, fill_value='abc').astype(object), # All duplicated
+            rs.randint(0, 100, 100_000).astype(object),        # Many repeated elements from small subset
+            rs.randint(0, 10_000, 100_000).astype(object),     # Many repeated elements from medium subset
+            rs.randint(0, 75_000, 100_000).astype(object),     # Some repeated elements from a large subset
+            np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).astype(object), # Custom
+        ]
+
+    elif setup == 'small_2d':
+        ITERATIONS = 5_000
+        axes = (0, 1)
+        arrays = [
+            np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
+            np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object),
+            np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object),
+        ]
+
+    elif setup == 'large_2d':
+        ITERATIONS = 10
+        axes = (0, 1)
+        arrays = [
+            np.arange(100_000).reshape(10_000, 10).astype(object),
+            np.hstack([np.arange(15), np.arange(90_000), np.arange(15), np.arange(9970)]).reshape(10_000, 10).astype(object),
+        ]
+
+    else:
+        assert False, "Impossible state!"
+
+    for _ in range(ITERATIONS):
+        for arr in arrays:
+            for axis in axes:
+                array_to_duplicated_hashable(arr, axis, True, False)
+                array_to_duplicated_hashable(arr, axis, False, True)
+                array_to_duplicated_hashable(arr, axis, False, False)
+
+
+if __name__ == '__main__':
+    try:
+        setup = sys.argv[1]
+        assert setup in ('small_1d', 'large_1d', 'small_2d', 'large_2d')
+    except IndexError:
+        print('Expected a setup arg!')
+        sys.exit(1)
+    except AssertionError:
+        print(f'Invalid setup arg: {setup}')
+        sys.exit(1)
+
+    main(setup)
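
profile.py gives a profiler a single repeatable workload. One plausible workflow, assuming gperftools is installed and the extension was linked with the -lprofiler flag shown in setup.py below (exact tool names and library paths vary by platform):

    CPUPROFILE=profile.prof python profile.py large_1d
    google-pprof --dot $(which python) profile.prof > profile.dot

    # or, with valgrind:
    valgrind --tool=callgrind python profile.py small_2d

The .gitignore entries added above (callgrind*, *.dot, *.out, *.prof) cover the artifacts these tools emit.
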
diff --git a/setup.py b/setup.py
index d3062396..7a293c33 100644
--- a/setup.py
+++ b/setup.py
@@ -24,6 +24,8 @@ def get_long_description() -> str:
 ak_extension = Extension(
         name='arraykit._arraykit', # build into module
         sources=['src/_arraykit.c'],
+        #extra_link_args=['-Wl,--no-as-needed,-lprofiler,--as-needed'], # Uncomment this to use gperftools
+        #extra_compile_args=['-O0'], # Uncomment this to disable optimization while debugging
         **additional_info,
         )
diff --git a/src/__init__.py b/src/__init__.py
index 988ca110..b113f124 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -16,3 +16,4 @@
 from ._arraykit import resolve_dtype_iter as resolve_dtype_iter
 from ._arraykit import isna_element as isna_element
 from ._arraykit import dtype_from_element as dtype_from_element
+from ._arraykit import array_to_duplicated_hashable as array_to_duplicated_hashable
diff --git a/src/__init__.pyi b/src/__init__.pyi
index 4ff12eb9..b839dc44 100644
--- a/src/__init__.pyi
+++ b/src/__init__.pyi
@@ -32,4 +32,9 @@ def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
 def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
 def isna_element(__value: tp.Any) -> bool: ...
 def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ...
-
+def array_to_duplicated_hashable(
+        array: np.ndarray,
+        axis: int = 0,
+        exclude_first: bool = False,
+        exclude_last: bool = False,
+        ) -> np.ndarray: ...
diff --git a/src/_arraykit.c b/src/_arraykit.c
index f8906a5c..5ee9069a 100644
--- a/src/_arraykit.c
+++ b/src/_arraykit.c
@@ -63,6 +63,11 @@
         fprintf(stderr, #msg);          \
         _AK_DEBUG_END()
 
+# define AK_DEBUG_INT(msg)              \
+    _AK_DEBUG_BEGIN();                  \
+        fprintf(stderr, #msg"=%x", (int)(msg)); \
+    _AK_DEBUG_END()
+
 # if defined __GNUC__ || defined __clang__
 # define AK_LIKELY(X) __builtin_expect(!!(X), 1)
 # define AK_UNLIKELY(X) __builtin_expect(!!(X), 0)
@@ -490,6 +495,472 @@ isna_element(PyObject *Py_UNUSED(m), PyObject *arg)
     Py_RETURN_FALSE;
 }
 
+//------------------------------------------------------------------------------
+// duplication
+
+// Defines how to process a hashable value.
+typedef int (*AK_handle_value_func)(Py_ssize_t i,
+        PyObject* value,
+        npy_bool* is_dup,
+        PyObject* set_obj,
+        PyObject* dict_obj
+);
+
+// Defines how to iterate over an arbitrary numpy (object) array
+typedef int (*AK_iterate_np_func)(PyArrayObject* array,
+        int axis,
+        int reverse,
+        npy_bool* is_dup,
+        AK_handle_value_func handle_value_func,
+        PyObject* set_obj,
+        PyObject* dict_obj
+);
+
+// Value processing funcs
+
+static int
+AK_handle_value_one_boundary(Py_ssize_t i, PyObject *value, npy_bool *is_dup,
+        PyObject *seen, PyObject * Py_UNUSED(dict_obj))
+{
+    /*
+    Used when the first duplicated element is considered unique.
+
+    If exclude_first && !exclude_last, we walk from left to right.
+    If !exclude_first && exclude_last, we walk from right to left.
+
+    Roughly equivalent Python:
+
+        if value not in seen:
+            seen.add(value)
+        else:
+            is_dup[i] = True
+    */
+    int found = PySet_Contains(seen, value);
+    if (found == -1) {
+        return -1;
+    }
+
+    if (found == 0) {
+        return PySet_Add(seen, value); // -1 on failure, 0 on success
+    }
+
+    is_dup[i] = NPY_TRUE;
+    return 0;
+}
+
+static int
+AK_handle_value_include_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_dup,
+        PyObject *seen,
+        PyObject *last_duplicate_locations)
+{
+    /*
+    Used when the first & last instances of duplicated values are considered unique.
+
+    Roughly equivalent Python:
+
+        if value not in seen:
+            seen.add(value)
+        else:
+            is_dup[i] = True
+
+            # Keep track of the last observed location, so we can mark it False (i.e. unique) at the end
+            last_duplicate_locations[value] = i
+    */
+    int found = PySet_Contains(seen, value);
+    if (found == -1) {
+        return -1;
+    }
+
+    if (found == 0) {
+        return PySet_Add(seen, value); // -1 on failure, 0 on success
+    }
+
+    is_dup[i] = NPY_TRUE;
+
+    PyObject *idx = PyLong_FromLong(i);
+    if (!idx) { return -1; }
+
+    int success = PyDict_SetItem(last_duplicate_locations, value, idx);
+    Py_DECREF(idx); // PyDict_SetItem does not steal the reference
+    return success; // -1 on failure, 0 on success
+}
+
+static int
+AK_handle_value_exclude_boundaries(Py_ssize_t i, PyObject *value, npy_bool *is_dup,
+        PyObject *duplicates,
+        PyObject *first_unique_locations)
+{
+    /*
+    Used when the first & last instances of duplicated values are considered duplicated.
+
+    Roughly equivalent Python:
+
+        if value not in first_unique_locations:
+            # Keep track of the first time we see each unique value, so we can mark the first location
+            # of each duplicated value as duplicated
+            first_unique_locations[value] = i
+        else:
+            is_dup[i] = True
+
+            # The second time we see a duplicate, we mark the first observed location as True (i.e. duplicated)
+            if value not in duplicates:
+                is_dup[first_unique_locations[value]] = True
+
+            # This value is duplicated!
+            duplicates.add(value)
+    */
+    int found = PyDict_Contains(first_unique_locations, value);
+    if (found == -1) {
+        return -1;
+    }
+
+    if (found == 0) {
+        PyObject *idx = PyLong_FromLong(i);
+        if (!idx) {
+            return -1;
+        }
+
+        int success = PyDict_SetItem(first_unique_locations, value, idx);
+        Py_DECREF(idx); // PyDict_SetItem does not steal the reference
+        return success; // -1 on failure, 0 on success
+    }
+
+    is_dup[i] = NPY_TRUE;
+
+    // Second time seeing a duplicate
+    found = PySet_Contains(duplicates, value);
+    if (found == -1) {
+        return -1;
+    }
+
+    if (found == 0) {
+        PyObject *first_unique_location = PyDict_GetItem(first_unique_locations, value); // Borrowed!
+        if (!first_unique_location) {
+            return -1;
+        }
+        long idx = PyLong_AsLong(first_unique_location);
+        if (idx == -1) {
+            return -1; // -1 always means failure here, since no stored location is negative
+        }
+        is_dup[idx] = NPY_TRUE;
+    }
+
+    return PySet_Add(duplicates, value);
+}
+
+// Iteration funcs
+
+static int
+AK_iter_1d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup,
+        AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj)
+{
+    /*
+    Iterates over a 1D numpy array.
+
+    Roughly equivalent Python code:
+
+        if reverse:
+            iterator = reversed(array)
+        else:
+            iterator = array
+
+        size = len(array)
+
+        for i, value in enumerate(iterator):
+            if reverse:
+                i = size - i - 1
+
+            process_value_func(i, value, is_dup, set_obj, dict_obj)
+    */
+    assert(axis == 0);
+    NpyIter *iter = NpyIter_New(array,
+            NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
+            NPY_KEEPORDER,
+            NPY_NO_CASTING,
+            NULL);
+    if (!iter) { goto failure; }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) { goto failure; }
+
+    char **dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
+    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+    // Do-while numpy iteration loops only happen once for 1D arrays!
+    do {
+        char *data = *dataptr;
+        npy_intp stride = *strideptr;
+        npy_intp count = *sizeptr;
+
+        PyObject *value = NULL;
+
+        Py_ssize_t i = 0;
+        int step = 1;
+        int stride_step = (int)stride; // We might walk in reverse!
+
+        if (reverse) {
+            data += (stride * (count - 1));
+            i = count - 1;
+            step = -1;
+            stride_step = -stride_step;
+        }
+
+        while (count--) {
+            // Object arrays contain pointers to PyObjects, so we only borrow the reference here.
+            memcpy(&value, data, sizeof(value));
+
+            // Process the value!
+            if (value_func(i, value, is_dup, set_obj, dict_obj) == -1) {
+                goto failure;
+            }
+
+            i += step;
+            data += stride_step;
+        }
+    } while (iternext(iter));
+
+    NpyIter_Deallocate(iter);
+    return 0;
+
+failure:
+    if (iter != NULL) {
+        NpyIter_Deallocate(iter);
+    }
+    return -1;
+}
+
+static int
+AK_iter_2d_array(PyArrayObject *array, int axis, int reverse, npy_bool *is_dup,
+        AK_handle_value_func value_func, PyObject *set_obj, PyObject *dict_obj)
+{
+    int is_c_order = PyArray_FLAGS(array) & NPY_ARRAY_C_CONTIGUOUS;
+
+    // When the axis aligns with the ordering (i.e. row-wise for C, col-wise for Fortran), the npy
+    // iterator visits one element at a time. Otherwise, it does a strided loop through the
+    // non-contiguous axis (which adds a lot of complexity). To prevent this, we make a copy of
+    // the array with the data laid out in the way we want.
+    PyArrayObject *copy = NULL;
+    if (is_c_order == axis) {
+        int new_flags = NPY_ARRAY_ALIGNED;
+        if (is_c_order) {
+            new_flags |= NPY_ARRAY_F_CONTIGUOUS;
+        }
+        else {
+            new_flags |= NPY_ARRAY_C_CONTIGUOUS;
+        }
+
+        copy = (PyArrayObject*)PyArray_FromArray(array, PyArray_DescrFromType(NPY_OBJECT), new_flags);
+        if (!copy) {
+            return -1;
+        }
+        array = copy;
+    }
+
+    int iter_flags = NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+    int order_flags = axis ? NPY_FORTRANORDER : NPY_CORDER;
+
+    NpyIter *iter = NpyIter_New(array, iter_flags, order_flags, NPY_NO_CASTING, NULL);
+    if (!iter) { goto failure; }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) { goto failure; }
+
+    char **dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
+
+    npy_intp tuple_size = PyArray_DIM(array, !axis);
+    npy_intp num_tuples = PyArray_DIM(array, axis);
+
+    do {
+        char *data = *dataptr;
+        npy_intp stride = *strideptr;
+
+        PyObject *value = NULL;
+
+        Py_ssize_t tup_idx = 0;
+        int step = 1;
+        int tup_stride_step = 0; // For forward iteration, each time we finish building a tuple, we
+                                 // are right where we need to be to start building the next one. For
+                                 // reverse, we have to backtrack two tuples' worth of strides to get
+                                 // where we need to be.
+
+        if (reverse) {
+            data += (stride * (num_tuples - 1) * tuple_size);
+            tup_idx = num_tuples - 1;
+            step = -1;
+            tup_stride_step = -(tuple_size * 2) * stride;
+        }
+
+        while (num_tuples--) {
+
+            PyObject *tup = PyTuple_New(tuple_size);
+            if (!tup) { goto failure; }
+
+            for (int j = 0; j < tuple_size; ++j) {
+                memcpy(&value, data, sizeof(value));
+                Py_INCREF(value);
+                PyTuple_SET_ITEM(tup, j, value);
+                data += stride;
+            }
+
+            int success = value_func(tup_idx, tup, is_dup, set_obj, dict_obj);
+            Py_DECREF(tup);
+            if (success == -1) { goto failure; }
+            tup_idx += step;
+            data += tup_stride_step;
+        }
+
+    } while (iternext(iter));
+
+    NpyIter_Deallocate(iter);
+    Py_XDECREF(copy); // release the re-ordered copy, if one was made
+    return 0;
+
+failure:
+    if (iter != NULL) {
+        NpyIter_Deallocate(iter);
+    }
+    Py_XDECREF(copy);
+    return -1;
+}
+
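
The copy-to-matching-order trick above is easiest to see from Python: in a C-ordered array the elements of a row are adjacent in memory, while the elements of a column are a full row apart; a Fortran-ordered copy makes each column contiguous, so the iterator can build each tuple from adjacent elements. A minimal illustration (strides shown assume 8-byte object pointers, as on a typical 64-bit build):

    import numpy as np

    a = np.arange(6, dtype=object).reshape(2, 3)  # C order: rows contiguous
    a.strides                                     # (24, 8): column neighbors are 24 bytes apart

    f = np.asfortranarray(a)                      # F order: columns contiguous
    f.strides                                     # (8, 16): column neighbors are adjacent
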
+static PyObject *
+array_to_duplicated_hashable(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
+{
+    /*
+    Main driver method. Determines how to iterate, and how to process the value of each
+    iteration, based on the array itself and the parameters.
+
+    Numpy 2D iteration is very different from Numpy 1D iteration, and so those two iteration
+    approaches are generalized.
+
+    Depending on the parameters, there are 4 different ways we can interpret uniqueness.
+
+    1. exclude_first=True and exclude_last=True
+       - This means the first & last observations of duplicated values are considered unique.
+       - We consider them `included` in what is reported as unique.
+
+    2. exclude_first=False and exclude_last=False
+       - This means the first & last observations of duplicated values are considered duplicated.
+       - We consider them `excluded` from what is reported as unique (by reporting them as duplicates).
+
+    3. exclude_first ^ exclude_last
+       - This means either the first OR the last observation will be considered unique, while the other is not.
+       - This allows for more efficient iteration, requiring only that we keep track of what we have
+         seen before, and changing the direction we iterate through the array.
+
+       - If exclude_first is True, then we iterate left-to-right, ensuring the first observation of each
+         value is reported as unique, with every subsequent duplicate observation being marked as a duplicate.
+
+       - If exclude_last is True, then we iterate right-to-left, ensuring the last observation of each
+         value is reported as unique, with every subsequent duplicate observation being marked as a duplicate.
+    */
+    PyArrayObject *array = NULL;
+    int axis = 0;
+    int exclude_first = 0;
+    int exclude_last = 0;
+
+    static char *kwarg_list[] = {"array", "axis", "exclude_first", "exclude_last", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+            "O!|iii:array_to_duplicated_hashable", kwarg_list,
+            &PyArray_Type, &array,
+            &axis,
+            &exclude_first,
+            &exclude_last))
+    {
+        return NULL;
+    }
+
+    if (PyArray_DESCR(array)->kind != 'O') {
+        PyErr_SetString(PyExc_ValueError, "Array must have object dtype");
+        return NULL;
+    }
+
+    int ndim = PyArray_NDIM(array);
+    if (axis > 1 || (ndim == 1 && axis == 1)) {
+        PyErr_SetString(PyExc_ValueError, "Axis must be 0 or 1 for 2d, and 0 for 1d");
+        return NULL;
+    }
+
+    int size = PyArray_DIM(array, axis);
+    int reverse = !exclude_first && exclude_last;
+
+    AK_handle_value_func handle_value_func = NULL;
+    AK_iterate_np_func iterate_array_func = NULL;
+
+    // 1. Determine how to iterate
+    if (ndim == 1) {
+        iterate_array_func = AK_iter_1d_array;
+    }
+    else {
+        iterate_array_func = AK_iter_2d_array;
+    }
+
+    npy_intp dims = {size};
+    PyArrayObject *is_dup = (PyArrayObject*)PyArray_Zeros(1, &dims, PyArray_DescrFromType(NPY_BOOL), 0);
+    if (!is_dup) {
+        return NULL;
+    }
+    npy_bool *is_dup_array = (npy_bool*)PyArray_DATA(is_dup);
+
+    PyObject *set_obj = PySet_New(NULL);
+    if (!set_obj) {
+        Py_DECREF(is_dup);
+        return NULL;
+    }
+
+    PyObject *dict_obj = NULL;
+
+    // 2. Determine how to process each value
+    if (exclude_first ^ exclude_last) {
+        // 2.a This approach only needs a set!
+        handle_value_func = AK_handle_value_one_boundary;
+    }
+    else {
+        // 2.b Both of these approaches require an additional dictionary to keep track of some observed indices
+        dict_obj = PyDict_New();
+        if (!dict_obj) {
+            goto failure;
+        }
+
+        if (!exclude_first && !exclude_last) {
+            handle_value_func = AK_handle_value_exclude_boundaries;
+        }
+        else {
+            handle_value_func = AK_handle_value_include_boundaries;
+        }
+    }
+
+    // 3. Execute
+    if (-1 == iterate_array_func(array, axis, reverse, is_dup_array, handle_value_func, set_obj, dict_obj)) {
+        goto failure;
+    }
+
+    // 4. Post-process
+    if (exclude_first && exclude_last) {
+        // Mark the last observed location of each duplicated value as False
+        assert(dict_obj != NULL);
+        PyObject *last_duplicate_locations = dict_obj; // Meaningful name alias
+
+        PyObject *value = NULL; // Borrowed
+        Py_ssize_t pos = 0;
+
+        while (PyDict_Next(last_duplicate_locations, &pos, NULL, &value)) {
+            long idx = PyLong_AsLong(value);
+            if (idx == -1) {
+                goto failure; // -1 always means failure here, since no stored location is negative
+            }
+            is_dup_array[idx] = NPY_FALSE;
+        }
+    }
+
+    Py_XDECREF(dict_obj);
+    Py_DECREF(set_obj);
+    return (PyObject *)is_dup;
+
+failure:
+    Py_XDECREF(dict_obj);
+    Py_DECREF(set_obj);
+    Py_DECREF(is_dup);
+    return NULL;
+}
+
 //------------------------------------------------------------------------------
 // ArrayGO
 //------------------------------------------------------------------------------
@@ -772,6 +1243,10 @@ static PyMethodDef arraykit_methods[] =  {
     {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
     {"isna_element", isna_element, METH_O, NULL},
     {"dtype_from_element", dtype_from_element, METH_O, NULL},
+    {"array_to_duplicated_hashable",
+            (PyCFunction)array_to_duplicated_hashable,
+            METH_VARARGS | METH_KEYWORDS,
+            NULL},
     {NULL},
 };
diff --git a/test/test_util.py b/test/test_util.py
index dcdc1c24..5ed47c33 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -16,6 +16,7 @@
 from arraykit import array_deepcopy
 from arraykit import isna_element
 from arraykit import dtype_from_element
+from performance.reference.util import array_to_duplicated_hashable
 
 from performance.reference.util import mloc as mloc_ref
 
@@ -368,6 +369,80 @@ def test_dtype_from_element_str_and_bytes_dtypes(self) -> None:
             self.assertEqual(np.dtype(f'|S{size}'), dtype_from_element(bytes(size)))
 
+    def test_array_to_duplicated_hashable_a(self) -> None:
+        a = array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object),
+                exclude_first=False, exclude_last=False)
+        assert a.tolist() == [False, True, True, True, True, True, True, False, True, True, True, False]
+
+        a = array_to_duplicated_hashable(np.array([0,1,2,2,1,4,5,3,4,5,5,6], dtype=object),
+                exclude_first=True, exclude_last=False)
+        assert a.tolist() == [False, False, False, True, True, False, False, False, True, True, True, False]
+
+    def test_array_to_duplicated_hashable_b(self) -> None:
+        a = np.array([[50, 50, 32, 17, 17], [2,2,1,3,3]], dtype=object)
+        # find duplicate rows
+        post = array_to_duplicated_hashable(a, axis=0)
+        assert post.tolist() == [False, False]
+
+        post = array_to_duplicated_hashable(a, axis=1)
+        assert post.tolist() == [True, True, False, True, True]
+
+        post = array_to_duplicated_hashable(a, axis=1, exclude_first=True)
+        assert post.tolist() == [False, True, False, False, True]
+
+    def test_array_to_duplicated_hashable_c(self) -> None:
+        c = array_to_duplicated_hashable(np.array(['q','q','q', 'a', 'w', 'w'], dtype=object),
+                exclude_first=False, exclude_last=False)
+        assert c.tolist() == [True, True, True, False, True, True]
+
+    def test_array_to_duplicated_hashable_d(self) -> None:
+        # NOTE: these cases fail with heterogeneous types, as we cannot sort them
+        a1 = np.array([0,0,1,0,None,None,0,1,None], dtype=object)
+        a2 = np.array([0,0,1,0,'q','q',0,1,'q'], dtype=object)
+
+        for array in (a1, a2):
+            post1 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False)
+            assert post1.tolist() == [True, True, True, True, True, True, True, True, True]
+
+            post2 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False)
+            assert post2.tolist() == [False, True, False, True, False, True, True, True, True]
+
+            post3 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True)
+            assert post3.tolist() == [True, True, True, True, True, True, False, False, False]
+
+            post4 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=True)
+            assert post4.tolist() == [False, True, False, True, False, True, False, False, False]
+
+    def test_array_to_duplicated_hashable_e(self) -> None:
+        array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object)
+
+        post1 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=False)
+        assert post1.tolist() == [False, True, True]
+
+        post2 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=False)
+        assert post2.tolist() == [False, False, True]
+
+        post3 = array_to_duplicated_hashable(array, exclude_first=False, exclude_last=True)
+        assert post3.tolist() == [False, True, False]
+
+        post4 = array_to_duplicated_hashable(array, exclude_first=True, exclude_last=True)
+        assert post4.tolist() == [False, False, False]
+
+    def test_array_to_duplicated_hashable_f(self) -> None:
+        array = np.array([[None, None, None, 32, 17, 17], [2,2,2,False,'q','q'], [2,2,2,False,'q','q'], ], dtype=object)
+
+        post1 = array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=False)
+        assert post1.tolist() == [True, True, True, False, True, True]
+
+        post2 = array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=False)
+        assert post2.tolist() == [False, True, True, False, False, True]
+
+        post3 = array_to_duplicated_hashable(array, axis=1, exclude_first=False, exclude_last=True)
+        assert post3.tolist() == [True, True, False, False, True, False]
+
+        post4 = array_to_duplicated_hashable(array, axis=1, exclude_first=True, exclude_last=True)
+        assert post4.tolist() == [False, True, False, False, False, False]
 
 
 if __name__ == '__main__':
     unittest.main()
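
Taken together, the pieces above can be exercised end to end. The exact commands depend on the local setup, but something like the following should work, assuming an in-place build (e.g. python setup.py build_ext --inplace):

    python debug.py             # cross-checks the C implementation against python_impl
    python -m unittest test.test_util
    python -m performance       # benchmarks arraykit against the reference implementation
    python profile.py large_1d  # fixed workload for attaching a profiler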