From 80c1fba1077fdae23acbab1d1c434ca5f664b68b Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 26 Mar 2021 16:44:37 -0700
Subject: [PATCH 01/15] Adds reference code and initial impl for roll_1d.

---
 arraykit.c                    | 139 ++++++++++++++++++++++++++++++++++
 arraykit.pyi                  |   1 +
 performance/reference/util.py |  60 +++++++++++++++
 test/test_util.py             |  80 ++++++++++++++++++-
 4 files changed, 278 insertions(+), 2 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 00d29264..073c661a 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -270,6 +270,144 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg)
     return (PyObject *)AK_ResolveDTypeIter(arg);
 }
 
+//------------------------------------------------------------------------------
+// rolling
+
+static int
+assign_into_slice_from_slice(PyObject *dest, PyObject *src, PyObject *dest_slice, PyObject *src_slice)
+{
+    PyObject* shifted_src = PyObject_GetItem((PyObject*)src, src_slice);
+    if (!shifted_src) {
+        return -1;
+    }
+
+    int success = PyObject_SetItem(dest, dest_slice, shifted_src);
+    Py_DECREF(shifted_src);
+    return success;
+}
+
+static PyObject *
+roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
+{
+    /* Algorithm.
+
+        size = len(array)
+        if size <= 1:
+            return array.copy()
+
+        shift = shift % size
+        if shift == 0:
+            return array.copy()
+
+        post = np.empty(size, dtype=array.dtype)
+        post[0:shift] = array[-shift:]
+        post[shift:] = array[0:-shift]
+        return post
+    */
+    PyArrayObject *array;
+    int shift;
+
+    if (!PyArg_ParseTuple(args, "O!i:roll_1d", &PyArray_Type, &array, &shift))
+    {
+        return NULL;
+    }
+
+    // Must be signed in order for modulo to work properly for negative shift values
+    int size = (int)PyArray_SIZE(array);
+
+    uint8_t is_empty = (size == 0);
+
+    if (!is_empty) {
+        shift = shift % size;
+    }
+
+    if (is_empty || (shift == 0)) {
+        PyObject* copy = PyArray_Copy(array);
+        if (!copy) {
+            return NULL;
+        }
+        return copy;
+    }
+
+    // Create an empty array
+    PyArray_Descr* dtype = PyArray_DESCR(array);
+    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
+
+    PyObject* post = PyArray_Empty(
+                PyArray_NDIM(array),
+                PyArray_DIMS(array),
+                dtype,
+                0);
+    if (!post) {
+        return NULL;
+    }
+
+    // Build integers
+    PyObject* zero = PyLong_FromLong(0);
+    PyObject* pos_shift = PyLong_FromLong(shift);
+    PyObject* neg_shift = PyLong_FromLong(-shift);
+    if (!zero || !pos_shift || !neg_shift) {
+        goto integer_build_failure;
+    }
+
+    // Build slices
+    PyObject* first_dest_slice = PySlice_New(zero, pos_shift, Py_None);     // [0:shift]
+    PyObject* first_src_slice = PySlice_New(neg_shift, Py_None, Py_None);   // [-shift:]
+    PyObject* second_dest_slice = PySlice_New(pos_shift, Py_None, Py_None); // [shift:]
+    PyObject* second_src_slice = PySlice_New(zero, neg_shift, Py_None);     // [0:-shift]
+    Py_DECREF(zero);
+    Py_DECREF(pos_shift);
+    Py_DECREF(neg_shift);
+    if (!first_dest_slice || !first_src_slice || !second_dest_slice || !second_src_slice) {
+        goto slice_build_failure;
+    }
+
+    int success;
+
+    // First Assign
+    success = assign_into_slice_from_slice(post, (PyObject*)array, first_dest_slice, first_src_slice);
+    Py_DECREF(first_dest_slice);
+    Py_DECREF(first_src_slice);
+    if (success == -1) {
+        Py_DECREF(second_dest_slice);
+        Py_DECREF(second_src_slice);
+        goto failure;
+    }
+
+    // Second Assign
+    success = assign_into_slice_from_slice(post, (PyObject*)array, second_dest_slice, second_src_slice);
+    Py_DECREF(second_src_slice);
+    Py_DECREF(second_dest_slice);
+    if (success == -1) {
+        goto failure;
+    }
+
+    return post;
+
+// Handle potentially leaked integer objects
+integer_build_failure:
+    Py_XDECREF(zero);
+    Py_XDECREF(pos_shift);
+    Py_XDECREF(neg_shift);
+    goto failure;
+
+// Handle potentially leaked slice objects
+slice_build_failure:
+    // Integer objects have all been cleaned up.
+    Py_XDECREF(first_dest_slice);
+    Py_XDECREF(first_src_slice);
+    Py_XDECREF(second_dest_slice);
+    Py_XDECREF(second_src_slice);
+    goto failure;
+
+// Handle final object that will always exist at this point.
+failure:
+    // Integer objects have all been cleaned up.
+    // Slice objects have all been cleaned up.
+    Py_DECREF(post);
+    return NULL;
+}
+
 //------------------------------------------------------------------------------
 // ArrayGO
 //------------------------------------------------------------------------------
@@ -546,6 +684,7 @@ static PyMethodDef arraykit_methods[] =  {
     {"row_1d_filter", row_1d_filter, METH_O, NULL},
     {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL},
     {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
+    {"roll_1d", roll_1d, METH_VARARGS, NULL},
     {NULL},
 };
 
diff --git a/arraykit.pyi b/arraykit.pyi
index b5a78afc..28c787db 100644
--- a/arraykit.pyi
+++ b/arraykit.pyi
@@ -27,3 +27,4 @@ def column_1d_filter(__array: np.array) -> np.ndarray: ...
 def row_1d_filter(__array: np.array) -> np.ndarray: ...
 def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
 def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
+def roll_1d(__array: np.ndarray, __shift: int) -> np.ndarray: ...
diff --git a/performance/reference/util.py b/performance/reference/util.py
index 6d437b28..566c8bca 100644
--- a/performance/reference/util.py
+++ b/performance/reference/util.py
@@ -181,3 +181,63 @@ def array_deepcopy(
     if memo is not None:
         memo[ident] = post
     return post
+
+
+def roll_1d(array: np.ndarray, shift: int) -> np.ndarray:
+    '''
+    Specialized form of np.roll that, by focusing on the 1D solution, is at least four times faster.
+    '''
+    size = len(array)
+    if size <= 1:
+        return array.copy()
+
+    # result will be positive
+    shift = shift % size
+    if shift == 0:
+        return array.copy()
+
+    post = np.empty(size, dtype=array.dtype)
+
+    post[0:shift] = array[-shift:]
+    post[shift:] = array[0:-shift]
+    return post
+
+
+def roll_2d(array: np.ndarray,
+            shift: int,
+            axis: int
+            ) -> np.ndarray:
+    '''
+    Specialized form of np.roll that focuses on the 2D solution.
+    '''
+    post = np.empty(array.shape, dtype=array.dtype)
+
+    if axis == 0: # roll rows
+        size = array.shape[0]
+        if size <= 1:
+            return array.copy()
+
+        # result will be positive
+        shift = shift % size
+        if shift == 0:
+            return array.copy()
+
+        post[0:shift, :] = array[-shift:, :]
+        post[shift:, :] = array[0:-shift, :]
+        return post
+
+    elif axis == 1: # roll columns
+        size = array.shape[1]
+        if size <= 1:
+            return array.copy()
+
+        # result will be positive
+        shift = shift % size
+        if shift == 0:
+            return array.copy()
+
+        post[:, 0:shift] = array[:, -shift:]
+        post[:, shift:] = array[:, 0:-shift]
+        return post
+
+    raise NotImplementedError()
diff --git a/test/test_util.py b/test/test_util.py
index 64c45cdd..49378517 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -12,6 +12,9 @@
 from arraykit import immutable_filter
 
 from performance.reference.util import mloc as mloc_ref
+#from performance.reference.util import roll_1d
+from arraykit import roll_1d
+from performance.reference.util import roll_2d
 
 
 class TestUnit(unittest.TestCase):
@@ -167,8 +170,81 @@ def test_row_1d_filter_a(self) -> None:
         with self.assertRaises(NotImplementedError):
             row_1d_filter(a1.reshape(1,2,5))
 
-if __name__ == '__main__':
-    unittest.main()
+    #---------------------------------------------------------------------------
+
+    def test_roll_1d_a(self) -> None:
+        a1 = np.arange(12)
+
+        for i in range(len(a1) + 1):
+            post = roll_1d(a1, i)
+            self.assertEqual(post.tolist(), np.roll(a1, i).tolist())
+
+            post = roll_1d(a1, -i)
+            self.assertEqual(post.tolist(), np.roll(a1, -i).tolist())
+
+    def test_roll_1d_b(self) -> None:
+        post = roll_1d(np.array([]), -4)
+        self.assertEqual([], post.tolist())
+
+    def test_roll_1d_c(self) -> None:
+        a1 = np.array([3, 4, 5, 6])
+        self.assertEqual(roll_1d(a1, 1).tolist(), [6, 3, 4, 5])
+        self.assertEqual(roll_1d(a1, -1).tolist(), [4, 5, 6, 3])
+
+    #---------------------------------------------------------------------------
+
+    def test_roll_2d_a(self) -> None:
+        a1 = np.arange(12).reshape((3,4))
+
+        for i in range(a1.shape[0] + 1):
+            post = roll_2d(a1, i, axis=0)
+            self.assertEqual(post.tolist(), np.roll(a1, i, axis=0).tolist())
+
+            post = roll_2d(a1, -i, axis=0)
+            self.assertEqual(post.tolist(), np.roll(a1, -i, axis=0).tolist())
+
+        for i in range(a1.shape[1] + 1):
+            post = roll_2d(a1, i, axis=1)
+            self.assertEqual(post.tolist(), np.roll(a1, i, axis=1).tolist())
 
+            post = roll_2d(a1, -i, axis=1)
+            self.assertEqual(post.tolist(), np.roll(a1, -i, axis=1).tolist())
 
+    def test_roll_2d_b(self) -> None:
+        post = roll_2d(np.array([[]]), -4, axis=1)
+        self.assertEqual(post.shape, (1, 0))
 
+    def test_roll_2d_c(self) -> None:
+        a1 = np.arange(12).reshape((3,4))
+
+        self.assertEqual(roll_2d(a1, -2, axis=0).tolist(),
+                [[8, 9, 10, 11], [0, 1, 2, 3], [4, 5, 6, 7]])
+
+        self.assertEqual(roll_2d(a1, -2, axis=1).tolist(),
+                [[2, 3, 0, 1], [6, 7, 4, 5], [10, 11, 8, 9]])
+
+    def test_roll_2d_d(self) -> None:
+        a1 = np.arange(6).reshape((2, 3))
+
+        self.assertEqual(roll_2d(a1, 1, axis=1).tolist(),
+                [[2, 0, 1], [5, 3, 4]])
+        self.assertEqual(roll_2d(a1, -1, axis=1).tolist(),
+                [[1, 2, 0], [4, 5, 3]])
+
+    def test_roll_2d_e(self) -> None:
+        a1 = np.arange(6).reshape((3, 2))
+
+        self.assertEqual(roll_2d(a1, 1, axis=0).tolist(),
+                [[4, 5], [0, 1], [2, 3]]
+                )
+        self.assertEqual(roll_2d(a1, -1, axis=0).tolist(),
+                [[2, 3], [4, 5], [0, 1]]
+                )
+
+    def test_roll_2d_f(self) -> None:
+        with self.assertRaises(NotImplementedError):
+            roll_2d(np.arange(4).reshape((2, 2)), 1, axis=2)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 90ffd3da35e17b9685b4cc5152abf1d6b452e3a2 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 26 Mar 2021 16:54:24 -0700
Subject: [PATCH 02/15] Adds performance benchmark for roll_1d.

---
 performance/main.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/performance/main.py b/performance/main.py
index d0bd0809..9bd24d37 100644
--- a/performance/main.py
+++ b/performance/main.py
@@ -15,6 +15,7 @@
 from performance.reference.util import row_1d_filter as row_1d_filter_ref
 from performance.reference.util import resolve_dtype as resolve_dtype_ref
 from performance.reference.util import resolve_dtype_iter as resolve_dtype_iter_ref
+from performance.reference.util import roll_1d as roll_1d_ref
 
 from performance.reference.array_go import ArrayGO as ArrayGOREF
 
@@ -27,6 +28,7 @@
 from arraykit import row_1d_filter as row_1d_filter_ak
 from arraykit import resolve_dtype as resolve_dtype_ak
 from arraykit import resolve_dtype_iter as resolve_dtype_iter_ak
+from arraykit import roll_1d as roll_1d_ak
 
 from arraykit import ArrayGO as ArrayGOAK
 
@@ -221,6 +223,26 @@ class ArrayGOPerfREF(ArrayGOPerf):
     entry = staticmethod(ArrayGOREF)
 
 
+#-------------------------------------------------------------------------------
+class Roll1d(Perf):
+    NUMBER = 10
+    SIZE = 20_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE)
+
+    def main(self):
+        for i in range(-(self.SIZE+1), self.SIZE+1):
+            self.entry(self.array, i)
+
+
+class Roll1dAK(Roll1d):
+    entry = staticmethod(roll_1d_ak)
+
+class Roll1dREF(Roll1d):
+    entry = staticmethod(roll_1d_ref)
+
+
 #-------------------------------------------------------------------------------
 
 def get_arg_parser():

From f86238ae1863b4505402c67ecd18567ccd2aca82 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 26 Mar 2021 19:05:12 -0700
Subject: [PATCH 03/15] Adds an alternative implementation for roll_1d.

---
 arraykit.c | 176 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 141 insertions(+), 35 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 073c661a..32967ae4 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -287,47 +287,14 @@ assign_into_slice_from_slice(PyObject *dest, PyObject *src, PyObject *dest_slice
 }
 
 static PyObject *
-roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
+_roll_1d_a(PyArrayObject* array, int shift)
 {
-    /* Algorithm.
-
-        size = len(array)
-        if size <= 1:
-            return array.copy()
-
-        shift = shift % size
-        if shift == 0:
-            return array.copy()
-
+    /*
         post = np.empty(size, dtype=array.dtype)
         post[0:shift] = array[-shift:]
         post[shift:] = array[0:-shift]
         return post
     */
-    PyArrayObject *array;
-    int shift;
-
-    if (!PyArg_ParseTuple(args, "O!i:roll_1d", &PyArray_Type, &array, &shift))
-    {
-        return NULL;
-    }
-
-    // Must be signed in order for modulo to work properly for negative shift values
-    int size = (int)PyArray_SIZE(array);
-
-    uint8_t is_empty = (size == 0);
-
-    if (!is_empty) {
-        shift = shift % size;
-    }
-
-    if (is_empty || (shift == 0)) {
-        PyObject* copy = PyArray_Copy(array);
-        if (!copy) {
-            return NULL;
-        }
-        return copy;
-    }
 
     // Create an empty array
     PyArray_Descr* dtype = PyArray_DESCR(array);
@@ -408,6 +375,145 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
     return NULL;
 }
 
+static PyObject *
+_roll_1d_b(PyArrayObject* array, int shift, int size)
+{
+    /*
+        post = np.empty(size, dtype=array.dtype)
+        post[0:shift] = array[-shift:]
+        post[shift:] = array[0:-shift]
+        return post
+    */
+
+    // Create an empty array
+    PyArray_Descr* dtype = PyArray_DESCR(array);
+    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
+
+    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
+                PyArray_NDIM(array),
+                PyArray_DIMS(array),
+                dtype,
+                0);
+    if (!post) {
+        return NULL;
+    }
+
+    npy_intp array_stride = PyArray_STRIDE(array, 0);
+    npy_intp post_stride = PyArray_STRIDE(post, 0);
+    char* array_dataptr = PyArray_BYTES(array);
+    char* post_dataptr = PyArray_BYTES(post);
+
+    for (int i = 0; i < size; ++i) {
+        int src_i = (i + size - shift) % size;
+
+        PyObject* obj = PyArray_GETITEM(array, array_dataptr + (array_stride * src_i));
+        if (!obj) {
+            Py_DECREF(post);
+            return NULL;
+        }
+
+        if (PyArray_SETITEM(post, post_dataptr + (i * post_stride), obj) == -1) {
+            Py_DECREF(post);
+            return NULL;
+        }
+    }
+
+    return (PyObject*)post;
+}
+
+static PyObject *
+_roll_1d_c(PyArrayObject* array, int shift, int size)
+{
+    /*
+        post = np.empty(size, dtype=array.dtype)
+        post[0:shift] = array[-shift:]
+        post[shift:] = array[0:-shift]
+        return post
+    */
+
+    // Create an empty array
+    PyArray_Descr* dtype = PyArray_DESCR(array);
+    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
+
+    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
+                PyArray_NDIM(array),
+                PyArray_DIMS(array),
+                dtype,
+                0);
+    if (!post) {
+        return NULL;
+    }
+
+    npy_intp array_stride = PyArray_STRIDE(array, 0);
+    npy_intp post_stride = PyArray_STRIDE(post, 0);
+    char* array_dataptr = PyArray_BYTES(array);
+    char* post_dataptr = PyArray_BYTES(post);
+
+    for (int i = 0; i < size; ++i) {
+        int src_i = (i + size - shift) % size;
+
+        PyObject* obj = PyArray_GETITEM(array, array_dataptr + (array_stride * src_i));
+        if (!obj) {
+            Py_DECREF(post);
+            return NULL;
+        }
+
+        if (PyArray_SETITEM(post, post_dataptr + (i * post_stride), obj) == -1) {
+            Py_DECREF(post);
+            return NULL;
+        }
+    }
+
+    return (PyObject*)post;
+}
+
+static PyObject *
+roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
+{
+    /* Algorithm.
+
+        size = len(array)
+        if size <= 1:
+            return array.copy()
+
+        shift = shift % size
+        if shift == 0:
+            return array.copy()
+
+        post = np.empty(size, dtype=array.dtype)
+        post[0:shift] = array[-shift:]
+        post[shift:] = array[0:-shift]
+        return post
+    */
+    PyArrayObject *array;
+    int shift;
+
+    if (!PyArg_ParseTuple(args, "O!i:roll_1d", &PyArray_Type, &array, &shift))
+    {
+        return NULL;
+    }
+
+    // Must be signed in order for modulo to work properly for negative shift values
+    int size = (int)PyArray_SIZE(array);
+
+    uint8_t is_empty = (size == 0);
+
+    if (!is_empty) {
+        shift = shift % size;
+    }
+
+    if (is_empty || (shift == 0)) {
+        PyObject* copy = PyArray_Copy(array);
+        if (!copy) {
+            return NULL;
+        }
+        return copy;
+    }
+    return _roll_1d_a(array, shift);
+    return _roll_1d_b(array, shift, size);
+}
+
+
 //------------------------------------------------------------------------------
 // ArrayGO
 //------------------------------------------------------------------------------

From fae903845ecb066944d943d1bbd87a29f8eeb7be Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 26 Mar 2021 20:13:26 -0700
Subject: [PATCH 04/15] Finishes third_implementation.

---
 arraykit.c | 88 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 35 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 32967ae4..c64244ba 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -422,49 +422,67 @@ _roll_1d_b(PyArrayObject* array, int shift, int size)
 }
 
 static PyObject *
-_roll_1d_c(PyArrayObject* array, int shift, int size)
+_roll_1d_c(PyArrayObject *array, int shift)
 {
-    /*
-        post = np.empty(size, dtype=array.dtype)
-        post[0:shift] = array[-shift:]
-        post[shift:] = array[0:-shift]
-        return post
-    */
-
-    // Create an empty array
-    PyArray_Descr* dtype = PyArray_DESCR(array);
-    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
+    // Tell the constructor to automatically allocate the output.
+    // The data type of the output will match that of the input.
+    PyArrayObject *arrays[2];
+    npy_uint32 arrays_flags[2];
+    arrays[0] = array;
+    arrays[1] = NULL;
+    arrays_flags[0] = NPY_ITER_READONLY;
+    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+
+    // Construct the iterator
+    NpyIter *iter = NpyIter_MultiNew(
+            2,                      // number of arrays
+            arrays,
+            NPY_ITER_EXTERNAL_LOOP, // No inner iteration - inner loop is handled by CopyArray code
+            NPY_KEEPORDER,          // Maintain existing order
+            NPY_NO_CASTING,         // Only allows identical types
+            arrays_flags,
+            NULL);                  // We don't have to specify dtypes since it will use array's
+
+    if (iter == NULL) {
+        return NULL;
+    }
 
-    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
-                PyArray_NDIM(array),
-                PyArray_DIMS(array),
-                dtype,
-                0);
-    if (!post) {
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) {
+        NpyIter_Deallocate(iter);
         return NULL;
     }
 
-    npy_intp array_stride = PyArray_STRIDE(array, 0);
-    npy_intp post_stride = PyArray_STRIDE(post, 0);
-    char* array_dataptr = PyArray_BYTES(array);
-    char* post_dataptr = PyArray_BYTES(post);
+    char** dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+    npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    for (int i = 0; i < size; ++i) {
-        int src_i = (i + size - shift) % size;
+    do {
+        char* src_data = dataptr[0];
+        char* dst_data = dataptr[1];
+        npy_intp size = *sizeptr;
 
-        PyObject* obj = PyArray_GETITEM(array, array_dataptr + (array_stride * src_i));
-        if (!obj) {
-            Py_DECREF(post);
-            return NULL;
-        }
+        int offset = ((size - shift) % size) * itemsize;
+        int first_chunk = (size * itemsize) - offset;
 
-        if (PyArray_SETITEM(post, post_dataptr + (i * post_stride), obj) == -1) {
-            Py_DECREF(post);
-            return NULL;
-        }
+        memcpy(dst_data, src_data + offset, first_chunk);
+        memcpy(dst_data + first_chunk, src_data, offset);
+    } while (iternext(iter));
+
+    // Get the result from the iterator object array
+    PyObject *ret = (PyObject*)NpyIter_GetOperandArray(iter)[1];
+    if (!ret) {
+        NpyIter_Deallocate(iter);
+        return NULL;
     }
+    Py_INCREF(ret);
 
-    return (PyObject*)post;
+    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return ret;
 }
 
 static PyObject *
@@ -509,8 +527,8 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
         }
         return copy;
     }
-    return _roll_1d_a(array, shift);
-    return _roll_1d_b(array, shift, size);
+
+    return _roll_1d_c(array, shift);
 }
 
 

From 7927694ebfebb72c75e11ad4a56e04263a5d7eb6 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 26 Mar 2021 21:16:12 -0700
Subject: [PATCH 05/15] So little to show for so much struggle.... Object
 arrays are hard.

---
 arraykit.c        | 5 +++--
 test/test_util.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index c64244ba..939c31c4 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -457,6 +457,7 @@ _roll_1d_c(PyArrayObject *array, int shift)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
+    // TODO: This does NOT work with objects....
     do {
         char* src_data = dataptr[0];
         char* dst_data = dataptr[1];
@@ -470,7 +471,7 @@ _roll_1d_c(PyArrayObject *array, int shift)
     } while (iternext(iter));
 
     // Get the result from the iterator object array
-    PyObject *ret = (PyObject*)NpyIter_GetOperandArray(iter)[1];
+    PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
     if (!ret) {
         NpyIter_Deallocate(iter);
         return NULL;
@@ -482,7 +483,7 @@ _roll_1d_c(PyArrayObject *array, int shift)
         return NULL;
     }
 
-    return ret;
+    return (PyObject*)ret;
 }
 
 static PyObject *
diff --git a/test/test_util.py b/test/test_util.py
index 49378517..f41d825b 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -173,7 +173,7 @@ def test_row_1d_filter_a(self) -> None:
     #---------------------------------------------------------------------------
 
     def test_roll_1d_a(self) -> None:
-        a1 = np.arange(12)
+        a1 = np.arange(12, dtype=float)
 
         for i in range(len(a1) + 1):
             post = roll_1d(a1, i)

From bb31b962c3233351dc6391fdb47f3f42728a7d94 Mon Sep 17 00:00:00 2001
From: Charles Burkland <charles.aburkland@gmail.com>
Date: Mon, 29 Mar 2021 00:08:47 -0700
Subject: [PATCH 06/15] Adds support for object dtype arrays.

---
 arraykit.c | 75 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 939c31c4..802a6508 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -433,15 +433,29 @@ _roll_1d_c(PyArrayObject *array, int shift)
     arrays_flags[0] = NPY_ITER_READONLY;
     arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
 
+    // No inner iteration - inner loop is handled by CopyArray code
+    // Reference objects are OK.
+    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+
     // Construct the iterator
     NpyIter *iter = NpyIter_MultiNew(
-            2,                      // number of arrays
+            2,              // number of arrays
             arrays,
-            NPY_ITER_EXTERNAL_LOOP, // No inner iteration - inner loop is handled by CopyArray code
-            NPY_KEEPORDER,          // Maintain existing order
-            NPY_NO_CASTING,         // Only allows identical types
+            iter_flags,
+            NPY_KEEPORDER,  // Maintain existing order for `array`
+            NPY_NO_CASTING, // Both arrays will have the same dtype so casting isn't needed or allowed
             arrays_flags,
-            NULL);                  // We don't have to specify dtypes since it will use array's
+            NULL);          // We don't have to specify dtypes since it will use array's
+
+    /* Per the documentation for NPY_ITER_REFS_OK:
+
+        Indicates that arrays with reference types (object arrays or structured arrays
+        containing an object type) may be accepted and used in the iterator. If this flag
+        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
+        is true, in which case it may not release the GIL during iteration.
+
+        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
+    */
 
     if (iter == NULL) {
         return NULL;
@@ -457,18 +471,49 @@ _roll_1d_c(PyArrayObject *array, int shift)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    // TODO: This does NOT work with objects....
-    do {
-        char* src_data = dataptr[0];
-        char* dst_data = dataptr[1];
-        npy_intp size = *sizeptr;
+    if (!PyDataType_ISOBJECT(PyArray_DESCR(array))) {
+        do {
+            char* src_data = dataptr[0];
+            char* dst_data = dataptr[1];
+            npy_intp size = *sizeptr;
+
+            int offset = ((size - shift) % size) * itemsize;
+            int first_chunk = (size * itemsize) - offset;
+
+            memcpy(dst_data, src_data + offset, first_chunk);
+            memcpy(dst_data + first_chunk, src_data, offset);
+        } while (iternext(iter));
+    }
+    else {
+        // Object arrays contain pointers to Python objects (PyObject*).
+        do {
+            char* src_data = dataptr[0];
+            char* dst_data = dataptr[1];
+            npy_intp size = *sizeptr;
 
-        int offset = ((size - shift) % size) * itemsize;
-        int first_chunk = (size * itemsize) - offset;
+            PyObject* src_ref = NULL;
+            PyObject* dst_ref = NULL;
 
-        memcpy(dst_data, src_data + offset, first_chunk);
-        memcpy(dst_data + first_chunk, src_data, offset);
-    } while (iternext(iter));
+            for (int i = 0; i < size; ++i) {
+                int offset = ((i + size - shift) % size) * itemsize;
+
+                // Update our temp PyObject* 's
+                memcpy(&src_ref, src_data + offset, sizeof(src_ref));
+                memcpy(&dst_ref, dst_data, sizeof(dst_ref));
+
+                // Copy the reference
+                memcpy(dst_data, &src_ref, sizeof(src_ref));
+
+                // Claim the reference
+                Py_XINCREF(src_ref);
+
+                // Release the reference in dst
+                Py_XDECREF(dst_ref);
+
+                dst_data += itemsize;
+            }
+        } while (iternext(iter));
+    }
 
     // Get the result from the iterator object array
     PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];

From 971ed2ccae052e8b8e15969eb7dd5887776f476b Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Mon, 29 Mar 2021 13:18:24 -0700
Subject: [PATCH 07/15] Addresses significant speed issue with last object
 approach. Adds more perf tests.

---
 arraykit.c          | 130 +++++++++++++++++++++++++++++++++++++++++---
 performance/main.py |  73 +++++++++++++++++++++++--
 2 files changed, 188 insertions(+), 15 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 802a6508..dd0b97d0 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -286,14 +286,15 @@ assign_into_slice_from_slice(PyObject *dest, PyObject *src, PyObject *dest_slice
     return success;
 }
 
+// Naive re-implementation of the Python reference algorithm in C
 static PyObject *
 _roll_1d_a(PyArrayObject* array, int shift)
 {
     /*
-        post = np.empty(size, dtype=array.dtype)
-        post[0:shift] = array[-shift:]
-        post[shift:] = array[0:-shift]
-        return post
+        cls           ak           ref          ref/ak
+        Roll1dInt     3.32787074   4.06750092   1.22225328
+        Roll1dFloat   3.32698173   4.06643037   1.2222581
+        Roll1dObject  37.89614459  38.76268129  1.02286609
     */
 
     // Create an empty array
@@ -375,14 +376,15 @@ _roll_1d_a(PyArrayObject* array, int shift)
     return NULL;
 }
 
+// Manual iteration using Numpy C api
 static PyObject *
 _roll_1d_b(PyArrayObject* array, int shift, int size)
 {
     /*
-        post = np.empty(size, dtype=array.dtype)
-        post[0:shift] = array[-shift:]
-        post[shift:] = array[0:-shift]
-        return post
+        cls           ak          ref         ref/ak
+        Roll1dInt     3.94763173  0.13514971  0.03423564
+        Roll1dFloat   3.95269516  0.13621643  0.03446166
+        Roll1dObject  1.03418866  0.46459488  0.4492361
     */
 
     // Create an empty array
@@ -421,9 +423,16 @@ _roll_1d_b(PyArrayObject* array, int shift, int size)
     return (PyObject*)post;
 }
 
+// Being clever with C for primitives, struggling with Objects
 static PyObject *
 _roll_1d_c(PyArrayObject *array, int shift)
 {
+    /*
+        cls           ak           ref          ref/ak
+        Roll1dInt     2.82467638   4.14947038   1.46900736
+        Roll1dFloat   2.89442847   4.13699139   1.42929474
+        Roll1dObject  112.6879144  38.81264949  0.34442602
+    */
     // Tell the constructor to automatically allocate the output.
     // The data type of the output will match that of the input.
     PyArrayObject *arrays[2];
@@ -531,6 +540,106 @@ _roll_1d_c(PyArrayObject *array, int shift)
     return (PyObject*)ret;
 }
 
+// Being clever with C for primitives, and figuring out Objects
+static PyObject *
+_roll_1d_d(PyArrayObject *array, int shift)
+{
+    /*
+        Roll1d20kInt     2.91365521  4.25724612  1.46113586
+        Roll1d20kFloat   3.21448036  4.40039245  1.36892809
+        Roll1d20kObject  6.7969062   8.32454664  1.22475526
+        Roll1d1kInt      0.33637808  1.32518703  3.93957601
+        Roll1d1kFloat    0.32248451  1.24809331  3.87024272
+        Roll1d1kObject   1.46907919  2.9891046   2.03467901
+    */
+    // Tell the constructor to automatically allocate the output.
+    // The data type of the output will match that of the input.
+    PyArrayObject *arrays[2];
+    npy_uint32 arrays_flags[2];
+    arrays[0] = array;
+    arrays[1] = NULL;
+    arrays_flags[0] = NPY_ITER_READONLY;
+    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+
+    // No inner iteration - inner loop is handled by CopyArray code
+    // Reference objects are OK.
+    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+
+    // Construct the iterator
+    NpyIter *iter = NpyIter_MultiNew(
+            2,              // number of arrays
+            arrays,
+            iter_flags,
+            NPY_KEEPORDER,  // Maintain existing order for `array`
+            NPY_NO_CASTING, // Both arrays will have the same dtype so casting isn't needed or allowed
+            arrays_flags,
+            NULL);          // We don't have to specify dtypes since it will use array's
+
+    /* Per the documentation for NPY_ITER_REFS_OK:
+
+        Indicates that arrays with reference types (object arrays or structured arrays
+        containing an object type) may be accepted and used in the iterator. If this flag
+        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
+        is true, in which case it may not release the GIL during iteration.
+
+        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
+    */
+
+    if (iter == NULL) {
+        return NULL;
+    }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+
+    char** dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+    npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
+
+    uint8_t is_object = PyDataType_ISOBJECT(PyArray_DESCR(array));
+
+    do {
+        char* src_data = dataptr[0];
+        char* dst_data = dataptr[1];
+        npy_intp size = *sizeptr;
+
+        int offset = ((size - shift) % size) * itemsize;
+        int first_chunk = (size * itemsize) - offset;
+
+        memcpy(dst_data, src_data + offset, first_chunk);
+        memcpy(dst_data + first_chunk, src_data, offset);
+
+        // Increment ref counts of objects.
+        if (PyDataType_ISOBJECT(PyArray_DESCR(array))) {
+            dst_data = dataptr[1];
+            for (int i = 0; i < size; ++i) {
+                PyObject* dst_ref = NULL;
+                memcpy(&dst_ref, dst_data, sizeof(dst_ref));
+                Py_INCREF(dst_ref);
+                dst_data += itemsize;
+            }
+        }
+    } while (iternext(iter));
+
+    // Get the result from the iterator object array
+    PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
+    if (!ret) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+    Py_INCREF(ret);
+
+    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return (PyObject*)ret;
+}
+
 static PyObject *
 roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
 {
@@ -574,7 +683,10 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
         return copy;
     }
 
-    return _roll_1d_c(array, shift);
+    //return _roll_1d_a(array, shift);       // Basically the same
+    //return _roll_1d_b(array, shift, size); // Way slower
+    //return _roll_1d_c(array, shift);       // Faster for primitives, same for objects
+    return _roll_1d_d(array, shift);         // Faster for primitives & objects!
 }
 
 
diff --git a/performance/main.py b/performance/main.py
index 9bd24d37..2e39b59b 100644
--- a/performance/main.py
+++ b/performance/main.py
@@ -223,8 +223,14 @@ class ArrayGOPerfREF(ArrayGOPerf):
     entry = staticmethod(ArrayGOREF)
 
 
+storage = []
+def build_subclassses(klass, ak_meth, ref_meth):
+    storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(ak_meth))))
+    storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(ref_meth))))
+
+
 #-------------------------------------------------------------------------------
-class Roll1d(Perf):
+class Roll1d20kInt(Perf):
     NUMBER = 10
     SIZE = 20_000
 
@@ -232,16 +238,71 @@ def pre(self):
         self.array = np.arange(self.SIZE)
 
     def main(self):
-        for i in range(-(self.SIZE+1), self.SIZE+1):
+        for i in range(-20_001, 20_001):
             self.entry(self.array, i)
 
+class Roll1d20kFloat(Perf):
+    NUMBER = 10
+    SIZE = 20_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE).astype(float)
 
-class Roll1dAK(Roll1d):
-    entry = staticmethod(roll_1d_ak)
+    def main(self):
+        for i in range(-20_001, 20_001):
+            self.entry(self.array, i)
+
+class Roll1d20kObject(Perf):
+    NUMBER = 2
+    SIZE = 20_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE).astype(object)
+
+    def main(self):
+        for i in range(-20_001, 20_001):
+            self.entry(self.array, i)
+
+class Roll1d1kInt(Perf):
+    NUMBER = 10
+    SIZE = 1_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE)
+
+    def main(self):
+        for i in range(-20_000, 20_000):
+            self.entry(self.array, i)
+
+class Roll1d1kFloat(Perf):
+    NUMBER = 10
+    SIZE = 1_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE).astype(float)
+
+    def main(self):
+        for i in range(-20_000, 20_000):
+            self.entry(self.array, i)
+
+class Roll1d1kObject(Perf):
+    NUMBER = 10
+    SIZE = 1_000
+
+    def pre(self):
+        self.array = np.arange(self.SIZE).astype(object)
+
+    def main(self):
+        for i in range(-20_000, 20_000):
+            self.entry(self.array, i)
 
-class Roll1dREF(Roll1d):
-    entry = staticmethod(roll_1d_ref)
 
+build_subclassses(Roll1d20kInt, roll_1d_ak, roll_1d_ref)
+build_subclassses(Roll1d20kFloat, roll_1d_ak, roll_1d_ref)
+build_subclassses(Roll1d20kObject, roll_1d_ak, roll_1d_ref)
+build_subclassses(Roll1d1kInt, roll_1d_ak, roll_1d_ref)
+build_subclassses(Roll1d1kFloat, roll_1d_ak, roll_1d_ref)
+build_subclassses(Roll1d1kObject, roll_1d_ak, roll_1d_ref)
 
 #-------------------------------------------------------------------------------
 

From 8843c09c255590a6537f38d0de470b3d249281eb Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Mon, 29 Mar 2021 20:34:25 -0700
Subject: [PATCH 08/15] Cleans up misc code.

---
 arraykit.c | 72 +++++++++---------------------------------------------
 1 file changed, 12 insertions(+), 60 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index dd0b97d0..af4422ea 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -274,14 +274,15 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg)
 // rolling
 
 static int
-assign_into_slice_from_slice(PyObject *dest, PyObject *src, PyObject *dest_slice, PyObject *src_slice)
+assign_into_slice_from_slice(PyObject *dst, int dst_start, int dst_stop,
+                             PyObject *src, int src_start, int src_stop)
 {
-    PyObject* shifted_src = PyObject_GetItem((PyObject*)src, src_slice);
+    PyObject* shifted_src = PySequence_GetSlice(src, src_start, src_stop);
     if (!shifted_src) {
         return -1;
     }
 
-    int success = PyObject_SetItem(dest, dest_slice, shifted_src);
+    int success = PySequence_SetSlice(dst, dst_start, dst_stop, shifted_src);
     Py_DECREF(shifted_src);
     return success;
 }
@@ -310,70 +311,23 @@ _roll_1d_a(PyArrayObject* array, int shift)
         return NULL;
     }
 
-    // Build integers
-    PyObject* zero = PyLong_FromLong(0);
-    PyObject* pos_shift = PyLong_FromLong(shift);
-    PyObject* neg_shift = PyLong_FromLong(-shift);
-    if (!zero || !pos_shift || !neg_shift) {
-        goto integer_build_failure;
-    }
-
-    // Build slices
-    PyObject* first_dest_slice = PySlice_New(zero, pos_shift, Py_None);     // [0:shift]
-    PyObject* first_src_slice = PySlice_New(neg_shift, Py_None, Py_None);   // [-shift:]
-    PyObject* second_dest_slice = PySlice_New(pos_shift, Py_None, Py_None); // [shift:]
-    PyObject* second_src_slice = PySlice_New(zero, neg_shift, Py_None);     // [0:-shift]
-    Py_DECREF(zero);
-    Py_DECREF(pos_shift);
-    Py_DECREF(neg_shift);
-    if (!first_dest_slice || !first_src_slice || !second_dest_slice || !second_src_slice) {
-        goto slice_build_failure;
-    }
-
     int success;
 
     // First Assign
-    success = assign_into_slice_from_slice(post, (PyObject*)array, first_dest_slice, first_src_slice);
-    Py_DECREF(first_dest_slice);
-    Py_DECREF(first_src_slice);
+    success = assign_into_slice_from_slice(post, 0, shift, (PyObject*)array, -shift, PyArray_SIZE(array));
     if (success == -1) {
-        Py_DECREF(second_dest_slice);
-        Py_DECREF(second_src_slice);
-        goto failure;
+        Py_DECREF(post);
+        return NULL;
     }
 
-    // First Assign
-    success = assign_into_slice_from_slice(post, (PyObject*)array, second_dest_slice, second_src_slice);
-    Py_DECREF(second_src_slice);
-    Py_DECREF(second_dest_slice);
+    // Second Assign
+    success = assign_into_slice_from_slice(post, shift, PyArray_SIZE(array), (PyObject*)array, 0, -shift);
     if (success == -1) {
-        goto failure;
+        Py_DECREF(post);
+        return NULL;
     }
 
     return post;
-
-// Handled potentially leaked integer objects
-integer_build_failure:
-    Py_XDECREF(zero);
-    Py_XDECREF(pos_shift);
-    Py_XDECREF(neg_shift);
-    goto failure;
-
-// Handled potentially leaked slice objects
-slice_build_failure:
-    // Integers objects have all been cleaned up.
-    Py_XDECREF(first_dest_slice);
-    Py_XDECREF(first_src_slice);
-    Py_XDECREF(second_dest_slice);
-    Py_XDECREF(second_src_slice);
-    goto failure;
-
-// Handle final object that will always exist at this point.
-failure:
-    // Integers objects have all been cleaned up.
-    // Slice objects have all been cleaned up.
-    Py_DECREF(post);
-    return NULL;
 }
 
 // Manual iteration using Numpy C api
@@ -599,8 +553,6 @@ _roll_1d_d(PyArrayObject *array, int shift)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    uint8_t is_object = PyDataType_ISOBJECT(PyArray_DESCR(array));
-
     do {
         char* src_data = dataptr[0];
         char* dst_data = dataptr[1];
@@ -615,7 +567,7 @@ _roll_1d_d(PyArrayObject *array, int shift)
         // Increment ref counts of objects.
         if (PyDataType_ISOBJECT(PyArray_DESCR(array))) {
             dst_data = dataptr[1];
-            for (int i = 0; i < size; ++i) {
+            while (size--) {
                 PyObject* dst_ref = NULL;
                 memcpy(&dst_ref, dst_data, sizeof(dst_ref));
                 Py_INCREF(dst_ref);

From 92209ed7b4c22543bbfbd732d794c3b6e793ff97 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Mon, 29 Mar 2021 23:16:38 -0700
Subject: [PATCH 09/15] Adds initial support & really cool ASCII art for
 roll_2d.

---
 arraykit.c                    | 320 ++++++++++++++++++++++++++++++++++
 arraykit.pyi                  |   1 +
 performance/reference/util.py |   5 +-
 test/test_util.py             |  19 +-
 4 files changed, 339 insertions(+), 6 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index af4422ea..189fd5a9 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -641,6 +641,325 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
     return _roll_1d_d(array, shift);         // Faster for primitives & objects!
 }
 
+// -----------------------------------------------------------------------------
+
+static PyObject *
+_roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
+{
+    /*
+    if axis == 0: # roll rows
+        post[0:shift, :] = array[-shift:, :]
+        post[shift:, :] = array[0:-shift, :]
+        return post
+
+    # roll columns
+    post[:, 0:shift] = array[:, -shift:]
+    post[:, shift:] = array[:, 0:-shift]
+    */
+    // Tell the constructor to automatically allocate the output.
+    // The data type of the output will match that of the input.
+    PyArrayObject *arrays[2];
+    npy_uint32 arrays_flags[2];
+    arrays[0] = array;
+    arrays[1] = NULL;
+    arrays_flags[0] = NPY_ITER_READONLY;
+    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+
+    // No inner iteration - inner loop is handled by CopyArray code
+    // Reference objects are OK.
+    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+
+    // Construct the iterator
+    NpyIter *iter = NpyIter_MultiNew(
+            2,              // number of arrays
+            arrays,
+            iter_flags,
+            NPY_KEEPORDER,
+            NPY_NO_CASTING, // Both arrays will have the same dtype so casting isn't needed or allowed
+            arrays_flags,
+            NULL);          // We don't have to specify dtypes since it will use array's
+
+    /* Per the documentation for NPY_ITER_REFS_OK:
+
+        Indicates that arrays with reference types (object arrays or structured arrays
+        containing an object type) may be accepted and used in the iterator. If this flag
+        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
+        is true, in which case it may not release the GIL during iteration.
+
+        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
+    */
+
+    if (iter == NULL) {
+        return NULL;
+    }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+
+    char** dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+    npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
+
+    uint32_t NUM_ROWS = PyArray_DIM(array, 0); // 3 rows
+    uint32_t rowsize  = PyArray_DIM(array, 1); // 5 cols (or 5 elements in each row)
+
+    do {
+        char* src_data = dataptr[0];
+        char* dst_data = dataptr[1];
+        npy_intp size = *sizeptr;
+
+        if (axis == 0) {
+            /*
+            Shift by rows! This is the easy case.
+
+            Imagine we have this array:
+            [0 1 2]
+            [3 4 5]
+            [6 7 8]
+
+            In memory, this is stored contiguously as: [0 1 2 3 4 5 6 7 8]
+            Placing parentheses, we can visualize where the columns are like so:
+                [(0 1 2) (3 4 5) (6 7 8)]
+
+            Given this, all we are concerned about is two contiguous blocks of memory.
+
+            For example, if shift = 1, we can copy from row[1] -> END to the front
+
+            source = [(0 1 2) (3 4 5) (6 7 8)]
+                               | | |   | | |
+                         -----------------
+                       | | |   | | |
+                       V V V   V V V
+            buffer = [(3 4 5) (6 7 8) (X X X)]
+
+            Now, we fill in the missing tail bytes with row[0] from the src buffer
+
+            source = [(0 1 2) (3 4 5) (6 7 8)]
+                       | | |
+                         -----------------
+                                       | | |
+                                       V V V
+            buffer = [(3 4 5) (6 7 8) (0 1 2)]
+
+            Now, our internal memory represents the result of a row shift.
+            We can see this if we represent the final buffer as a 2D grid:
+
+            [6 7 8]
+            [0 1 2]
+            [3 4 5]
+            */
+
+            // Easiest case! Merely shift the rows
+            int offset = ((NUM_ROWS - shift) % NUM_ROWS) * rowsize * itemsize;
+            int first_chunk = (size * itemsize) - offset;
+
+            memcpy(dst_data, src_data + offset, first_chunk);
+            memcpy(dst_data + first_chunk, src_data, offset);
+        }
+        else {
+            /*
+            Shift by columns! This is the more difficult case.
+
+            Let's use a slightly different array
+            [0 1 2 3 4]
+            [5 6 7 8 9]
+            [A B C D E]
+
+            If we shift by 2, our goal array will be:
+            [3 4 0 1 2]
+            [8 9 5 6 7]
+            [D E A B C]
+
+            Alternatively, we want our contiguous memory to go from:
+
+            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
+            buffer = [(3 4 0 1 2) (8 9 5 6 7) (D E A B C)]
+
+            In order to do this as efficiently as possible, we first fill the result buffer with the source shifted.
+
+            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
+                        \ \ \ \ \   \ \ \ \ \   \ \ \
+                         \ \ \  ----  \ \ \ ----  \ \ \
+                          \ \ \    \ \ \ \ \   \ \ \ \ \
+            buffer = [(X X 0 1 2) (3 4 5 6 7) (8 9 A B C)]
+
+            Now, all that's left is to fix the incorrect values
+
+            buffer = [(X X 0 1 2) (3 4 5 6 7) (8 9 A B C)]
+                       ^ ^         ^ ^         ^ ^
+
+            We can fill these by copying the values from each row
+
+            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
+                             | |         | |         | |
+                        -------     -------     -------
+                       | |         | |         | |
+                       V V         V V         V V
+            buffer = [(3 4 0 1 2) (8 9 5 6 7) (D E A B C)]
+
+            Now, our internal memory represents the result of a column shift.
+            We can see this if we represent the final buffer as a 2D grid:
+
+            [3 4 0 1 2]
+            [8 9 5 6 7]
+            [D E A B C]
+            */
+            if (shift > rowsize / 2) {
+                /* SHIFT LEFT
+
+                This branch is optimized for cases where the offset is greater than half of the columns.
+
+                For this, instead of shifting right and being forced to fill in a large section for each row,
+                we shift left and only have to fill in small section
+
+                Example:
+
+                Inefficient
+                [0 1 2 3 4]   [0 1 2 3 4]
+                 \               | | | |
+                  ------        -------
+                         \     | | | |
+                         V     V V V V
+                [X X X X 0]   [1 2 3 4 0]
+
+                Efficient
+                [0 1 2 3 4]   [0 1 2 3 4]
+                  / / / /      |
+                  | | | |       -------
+                  | | | |              |
+                 / / / /               V
+                [1 2 3 4 X]   [1 2 3 4 0]
+                */
+                int offset = (rowsize - shift) * itemsize;
+                int num_bytes = (size * itemsize) - offset;
+                memcpy(dst_data, src_data+offset, num_bytes);
+
+                num_bytes = offset; // This is how much we need to copy for each column.
+
+                // Update the shifted portion of each row.
+                for (size_t i = 0; i < NUM_ROWS; ++i) {
+                    int row_offset = i * rowsize * itemsize;
+
+                    // We need to fill in the rightmost values of this row since we shifted by an offset
+                    int dst_offset = row_offset + ((rowsize * itemsize) - offset);
+                    int src_offset = row_offset;
+
+                    memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
+                }
+            }
+            else {
+                // SHIFT RIGHT
+                int offset = shift * itemsize;
+                int num_bytes = (size * itemsize) - offset;
+                memcpy(dst_data+offset, src_data, num_bytes);
+
+                num_bytes = offset; // This is how much we need to copy for each column.
+
+                // Update the shifted portion of each row.
+                for (size_t i = 0; i < NUM_ROWS; ++i) {
+                    int row_offset = i * rowsize * itemsize;
+
+                    // We need to fill in the leftmost values of this row since we shifted by an offset
+                    int dst_offset = row_offset;
+                    int src_offset = row_offset + ((rowsize - shift) * itemsize);
+
+                    memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
+                }
+            }
+        }
+    } while (iternext(iter));
+
+    // Get the result from the iterator object array
+    PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
+    if (!ret) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+    Py_INCREF(ret);
+
+    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return (PyObject*)ret;
+}
+
+static PyObject *
+roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
+{
+    /* Algorithm.
+
+        size = array.shape[axis]
+
+        if shift != 0:
+            shift = shift % size
+
+        if size <= 1 or shift == 0:
+            return array.copy()
+
+        if shift < 0:
+            shift = size + shift
+
+        if axis == 0:
+            post[0:shift, :] = array[-shift:, :]
+            post[shift:, :] = array[0:-shift, :]
+            return post
+
+        post[:, 0:shift] = array[:, -shift:]
+        post[:, shift:] = array[:, 0:-shift]
+        return post
+    */
+    PyArrayObject *array;
+    int shift;
+    int axis;
+
+    static char *kwlist[] = {"array", "shift", "axis", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!ii:roll_1d",
+                                     kwlist,
+                                     &PyArray_Type, &array,
+                                     &shift, &axis))
+    {
+        return NULL;
+    }
+
+    if (axis != 0 && axis != 1) {
+        PyErr_SetString(PyExc_ValueError, "Axis must be 0 or 1");
+        return NULL;
+    }
+
+    if (PyArray_NDIM(array) != 2) {
+        PyErr_SetString(PyExc_ValueError, "Array must be 2D");
+        return NULL;
+    }
+
+    // Must be signed in order for modulo to work properly for negative shift values
+    int size = (int)PyArray_DIM(array, axis);
+
+    uint8_t is_empty = (size == 0);
+
+    if (!is_empty) {
+        shift = shift % size;
+        if (shift < 0) {
+            shift = size + shift;
+        }
+    }
+
+    if (is_empty || (shift == 0)) {
+        PyObject* copy = PyArray_Copy(array);
+        if (!copy) {
+            return NULL;
+        }
+        return copy;
+    }
+
+    return _roll_2d_a(array, (uint32_t)shift, axis);
+}
+
 
 //------------------------------------------------------------------------------
 // ArrayGO
@@ -919,6 +1238,7 @@ static PyMethodDef arraykit_methods[] =  {
     {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL},
     {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
     {"roll_1d", roll_1d, METH_VARARGS, NULL},
+    {"roll_2d", (PyCFunction)roll_2d, METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL},
 };
 
diff --git a/arraykit.pyi b/arraykit.pyi
index 28c787db..9ede4fc0 100644
--- a/arraykit.pyi
+++ b/arraykit.pyi
@@ -28,3 +28,4 @@ def row_1d_filter(__array: np.array) -> np.ndarray: ...
 def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
 def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
 def roll_1d(__array: np.ndarray, __shift: int) -> np.ndarray: ...
+def roll_2d(__array: np.ndarray, __shift: int, __axis: int) -> np.ndarray: ...
diff --git a/performance/reference/util.py b/performance/reference/util.py
index 566c8bca..a9c4a8f9 100644
--- a/performance/reference/util.py
+++ b/performance/reference/util.py
@@ -203,10 +203,7 @@ def roll_1d(array: np.ndarray, shift: int) -> np.ndarray:
     return post
 
 
-def roll_2d(array: np.ndarray,
-            shift: int,
-            axis: int
-            ) -> np.ndarray:
+def roll_2d(array: np.ndarray, shift: int, axis: int) -> np.ndarray:
     '''
     Specialized form of np.roll that, by focusing on the 2D solution
     '''
diff --git a/test/test_util.py b/test/test_util.py
index f41d825b..75784466 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -14,7 +14,7 @@
 from performance.reference.util import mloc as mloc_ref
 #from performance.reference.util import roll_1d
 from arraykit import roll_1d
-from performance.reference.util import roll_2d
+from arraykit import roll_2d
 
 
 class TestUnit(unittest.TestCase):
@@ -192,6 +192,21 @@ def test_roll_1d_c(self) -> None:
         self.assertEqual(roll_1d(a1, -1).tolist(), [4, 5, 6, 3])
 
     #---------------------------------------------------------------------------
+    def test_roll_2d_row(self) -> None:
+        arr = np.arange(15).reshape(3, 5)
+
+        for shift in range(-10, 10):
+            np_result = np.roll(arr, shift, axis=0)
+            ak_result = roll_2d(arr, shift, axis=0)
+            self.assertTrue((np_result == ak_result).all())
+
+    def test_roll_2d_col(self) -> None:
+        arr = np.arange(15).reshape(3, 5)
+
+        for shift in range(-10, 10):
+            np_result = np.roll(arr, shift, axis=1)
+            ak_result = roll_2d(arr, shift, axis=1)
+            self.assertTrue((np_result == ak_result).all())
 
     def test_roll_2d_a(self) -> None:
         a1 = np.arange(12).reshape((3,4))
@@ -242,7 +257,7 @@ def test_roll_2d_e(self) -> None:
                 )
 
     def test_roll_2d_f(self) -> None:
-        with self.assertRaises(NotImplementedError):
+        with self.assertRaises(ValueError):
             roll_2d(np.arange(4).reshape((2, 2)), 1, axis=2)
 
 

From 387706e5062703f3d554e9bce95639a6a37ca0c4 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Tue, 30 Mar 2021 11:39:48 -0700
Subject: [PATCH 10/15] Removes unnecessary memcpy. Addresses conversion
 warnings.

---
 arraykit.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index 189fd5a9..a1bffced 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -190,9 +190,9 @@ shape_filter(PyObject *Py_UNUSED(m), PyObject *a)
     AK_CHECK_NUMPY_ARRAY_1D_2D(a);
     PyArrayObject *array = (PyArrayObject *)a;
 
-    int size0 = PyArray_DIM(array, 0);
+    int size0 = (int)PyArray_DIM(array, 0);
     // If 1D array, set size for axis 1 at 1, else use 2D array to get the size of axis 1
-    int size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1);
+    int size1 = (int)(PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1));
     return Py_BuildValue("ii", size0, size1);
 }
 
@@ -314,14 +314,14 @@ _roll_1d_a(PyArrayObject* array, int shift)
     int success;
 
     // First Assign
-    success = assign_into_slice_from_slice(post, 0, shift, (PyObject*)array, -shift, PyArray_SIZE(array));
+    success = assign_into_slice_from_slice(post, 0, shift, (PyObject*)array, -shift, (int)PyArray_SIZE(array));
     if (success == -1) {
         Py_DECREF(post);
         return NULL;
     }
 
     // Second Assign
-    success = assign_into_slice_from_slice(post, shift, PyArray_SIZE(array), (PyObject*)array, 0, -shift);
+    success = assign_into_slice_from_slice(post, shift, (int)PyArray_SIZE(array), (PyObject*)array, 0, -shift);
     if (success == -1) {
         Py_DECREF(post);
         return NULL;
@@ -440,8 +440,8 @@ _roll_1d_c(PyArrayObject *array, int shift)
             char* dst_data = dataptr[1];
             npy_intp size = *sizeptr;
 
-            int offset = ((size - shift) % size) * itemsize;
-            int first_chunk = (size * itemsize) - offset;
+            npy_intp offset = ((size - shift) % size) * itemsize;
+            npy_intp first_chunk = (size * itemsize) - offset;
 
             memcpy(dst_data, src_data + offset, first_chunk);
             memcpy(dst_data + first_chunk, src_data, offset);
@@ -458,7 +458,7 @@ _roll_1d_c(PyArrayObject *array, int shift)
             PyObject* dst_ref = NULL;
 
             for (int i = 0; i < size; ++i) {
-                int offset = ((i + size - shift) % size) * itemsize;
+                npy_intp offset = ((i + size - shift) % size) * itemsize;
 
                 // Update our temp PyObject* 's
                 memcpy(&src_ref, src_data + offset, sizeof(src_ref));
@@ -558,8 +558,8 @@ _roll_1d_d(PyArrayObject *array, int shift)
         char* dst_data = dataptr[1];
         npy_intp size = *sizeptr;
 
-        int offset = ((size - shift) % size) * itemsize;
-        int first_chunk = (size * itemsize) - offset;
+        npy_intp offset = ((size - shift) % size) * itemsize;
+        npy_intp first_chunk = (size * itemsize) - offset;
 
         memcpy(dst_data, src_data + offset, first_chunk);
         memcpy(dst_data + first_chunk, src_data, offset);
@@ -568,9 +568,7 @@ _roll_1d_d(PyArrayObject *array, int shift)
         if (PyDataType_ISOBJECT(PyArray_DESCR(array))) {
             dst_data = dataptr[1];
             while (size--) {
-                PyObject* dst_ref = NULL;
-                memcpy(&dst_ref, dst_data, sizeof(dst_ref));
-                Py_INCREF(dst_ref);
+                Py_INCREF(*(PyObject**)dst_data);
                 dst_data += itemsize;
             }
         }
@@ -703,8 +701,8 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    uint32_t NUM_ROWS = PyArray_DIM(array, 0); // 3 rows
-    uint32_t rowsize  = PyArray_DIM(array, 1); // 5 cols (or 5 elements in each row)
+    uint32_t NUM_ROWS = (uint32_t)PyArray_DIM(array, 0);
+    uint32_t rowsize  = (uint32_t)PyArray_DIM(array, 1);
 
     do {
         char* src_data = dataptr[0];

From a487ea2f92319b845ff1d84ed1c8e716f4c89604 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Tue, 30 Mar 2021 17:02:01 -0700
Subject: [PATCH 11/15] Updates some types and clears up ambiguity of some
 arithmetic expressions.

---
 arraykit.c | 63 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index a1bffced..ab5089db 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -633,16 +633,19 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
         return copy;
     }
 
-    //return _roll_1d_a(array, shift);       // Basically the same
-    //return _roll_1d_b(array, shift, size); // Way slower
-    //return _roll_1d_c(array, shift);       // Faster for primitives, same for objects
+    // Silence UnuSEd fUnCTioN warnings.
+    if (0) {
+        return _roll_1d_a(array, shift);       // Basically the same
+        return _roll_1d_b(array, shift, size); // Way slower
+        return _roll_1d_c(array, shift);       // Faster for primitives, same for objects
+    }
     return _roll_1d_d(array, shift);         // Faster for primitives & objects!
 }
 
 // -----------------------------------------------------------------------------
 
 static PyObject *
-_roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
+_roll_2d_a(PyArrayObject *array, npy_uint shift, int axis)
 {
     /*
     if axis == 0: # roll rows
@@ -701,13 +704,15 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    uint32_t NUM_ROWS = (uint32_t)PyArray_DIM(array, 0);
-    uint32_t rowsize  = (uint32_t)PyArray_DIM(array, 1);
+    npy_uint NUM_ROWS = (npy_uint)PyArray_DIM(array, 0);
+    npy_uint rowsize  = (npy_uint)PyArray_DIM(array, 1);
+    npy_uint bytes_in_row = rowsize * itemsize;
 
     do {
-        char* src_data = dataptr[0];
-        char* dst_data = dataptr[1];
+        char *src_data = dataptr[0];
+        char *dst_data = dataptr[1];
         npy_intp size = *sizeptr;
+        npy_uint total_bytes = size * itemsize;
 
         if (axis == 0) {
             /*
@@ -719,12 +724,12 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
             [6 7 8]
 
             In memory, this is stored contiguously as: [0 1 2 3 4 5 6 7 8]
-            Placing parentheses, we can visualize where the columns are like so:
+            Placing parentheses, we can visualize where the rows are like so:
                 [(0 1 2) (3 4 5) (6 7 8)]
 
             Given this, all we are concerned about is two contiguous blocks of memory.
 
-            For example, if shift = 1, we can copy from row[1] -> END to the front
+            For example, if shift = -1, we can copy from row[1] -> END to the front
 
             source = [(0 1 2) (3 4 5) (6 7 8)]
                                | | |   | | |
@@ -745,17 +750,17 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
             Now, our internal memory represents the result of a row shift.
             We can see this if we represent the final buffer as a 2D grid:
 
+            [3 4 5]
             [6 7 8]
             [0 1 2]
-            [3 4 5]
             */
 
             // Easiest case! Merely shift the rows
-            int offset = ((NUM_ROWS - shift) % NUM_ROWS) * rowsize * itemsize;
-            int first_chunk = (size * itemsize) - offset;
+            npy_intp offset = (NUM_ROWS - shift) * bytes_in_row;
+            npy_intp chunksize = total_bytes - offset;
 
-            memcpy(dst_data, src_data + offset, first_chunk);
-            memcpy(dst_data + first_chunk, src_data, offset);
+            memcpy(dst_data, src_data + offset, chunksize);
+            memcpy(dst_data + chunksize, src_data, offset);
         }
         else {
             /*
@@ -813,7 +818,7 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
                 For this, instead of shifting right and being forced to fill in a large section for each row,
                 we shift left and only have to fill in small section
 
-                Example:
+                Example: Shift by 4
 
                 Inefficient
                 [0 1 2 3 4]   [0 1 2 3 4]
@@ -831,38 +836,38 @@ _roll_2d_a(PyArrayObject *array, uint32_t shift, int axis)
                  / / / /               V
                 [1 2 3 4 X]   [1 2 3 4 0]
                 */
-                int offset = (rowsize - shift) * itemsize;
-                int num_bytes = (size * itemsize) - offset;
-                memcpy(dst_data, src_data+offset, num_bytes);
+                npy_intp offset = (rowsize - shift) * itemsize;
+                npy_intp num_bytes = total_bytes - offset;
+                memcpy(dst_data, src_data + offset, num_bytes);
 
                 num_bytes = offset; // This is how much we need to copy for each column.
 
                 // Update the shifted portion of each row.
                 for (size_t i = 0; i < NUM_ROWS; ++i) {
-                    int row_offset = i * rowsize * itemsize;
+                    npy_intp row_offset = i * bytes_in_row;
 
                     // We need to fill in the rightmost values of this row since we shifted by an offset
-                    int dst_offset = row_offset + ((rowsize * itemsize) - offset);
-                    int src_offset = row_offset;
+                    npy_intp dst_offset = row_offset + bytes_in_row - num_bytes;
+                    npy_intp src_offset = row_offset;
 
                     memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
                 }
             }
             else {
                 // SHIFT RIGHT
-                int offset = shift * itemsize;
-                int num_bytes = (size * itemsize) - offset;
+                npy_intp offset = shift * itemsize;
+                npy_intp num_bytes = total_bytes - offset;
                 memcpy(dst_data+offset, src_data, num_bytes);
 
                 num_bytes = offset; // This is how much we need to copy for each column.
 
                 // Update the shifted portion of each row.
                 for (size_t i = 0; i < NUM_ROWS; ++i) {
-                    int row_offset = i * rowsize * itemsize;
+                    npy_intp row_offset = i * bytes_in_row;
 
                     // We need to fill in the leftmost values of this row since we shifted by an offset
-                    int dst_offset = row_offset;
-                    int src_offset = row_offset + ((rowsize - shift) * itemsize);
+                    npy_intp dst_offset = row_offset;
+                    npy_intp src_offset = row_offset + ((rowsize - shift) * itemsize);
 
                     memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
                 }
@@ -913,7 +918,7 @@ roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
     */
     PyArrayObject *array;
     int shift;
-    int axis;
+    int axis; // npy_intp
 
     static char *kwlist[] = {"array", "shift", "axis", NULL};
 
@@ -955,7 +960,7 @@ roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
         return copy;
     }
 
-    return _roll_2d_a(array, (uint32_t)shift, axis);
+    return _roll_2d_a(array, (npy_uint)shift, axis);
 }
 
 

From 7d5f41a92ef8ff06e8077623bf3185fd64b18f38 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Tue, 30 Mar 2021 18:29:33 -0700
Subject: [PATCH 12/15] Implements a roll_2d approach that handles bytes
 manually.

---
 arraykit.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index ab5089db..d920da2f 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -891,6 +891,85 @@ _roll_2d_a(PyArrayObject *array, npy_uint shift, int axis)
     return (PyObject*)ret;
 }
 
+static PyObject *
+_roll_2d_b(PyArrayObject *array, npy_uint shift, int axis)
+{
+    // Create an empty array
+    PyArray_Descr* dtype = PyArray_DESCR(array);
+    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
+
+    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
+                PyArray_NDIM(array),
+                PyArray_DIMS(array),
+                dtype,
+                0);
+    if (!post) {
+        return NULL;
+    }
+
+    npy_intp itemsize = dtype->elsize;
+    npy_intp size = PyArray_SIZE(array);
+
+    char* src_data = PyArray_BYTES(array);
+    char* dst_data = PyArray_BYTES(post);
+
+    npy_uint NUM_ROWS = (npy_uint)PyArray_DIM(array, 0);
+    npy_uint rowsize  = (npy_uint)PyArray_DIM(array, 1);
+    npy_uint bytes_in_row = rowsize * itemsize;
+    npy_uint total_bytes = size * itemsize;
+
+    if (axis == 0) {
+        // Easiest case! Merely shift the rows
+        npy_intp offset = (NUM_ROWS - shift) * bytes_in_row;
+        npy_intp chunksize = total_bytes - offset;
+
+        memcpy(dst_data, src_data + offset, chunksize);
+        memcpy(dst_data + chunksize, src_data, offset);
+    }
+    else {
+        if (shift > rowsize / 2) {
+            // SHIFT LEFT
+            npy_intp offset = (rowsize - shift) * itemsize;
+            npy_intp num_bytes = total_bytes - offset;
+            memcpy(dst_data, src_data + offset, num_bytes);
+
+            num_bytes = offset; // This is how much we need to copy for each column.
+
+            // Update the shifted portion of each row.
+            for (size_t i = 0; i < NUM_ROWS; ++i) {
+                npy_intp row_offset = i * bytes_in_row;
+
+                // We need to fill in the rightmost values of this row since we shifted by an offset
+                npy_intp dst_offset = row_offset + bytes_in_row - num_bytes;
+                npy_intp src_offset = row_offset;
+
+                memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
+            }
+        }
+        else {
+            // SHIFT RIGHT
+            npy_intp offset = shift * itemsize;
+            npy_intp num_bytes = total_bytes - offset;
+            memcpy(dst_data + offset, src_data, num_bytes);
+
+            num_bytes = offset; // This is how much we need to copy for each column.
+
+            // Update the shifted portion of each row.
+            for (size_t i = 0; i < NUM_ROWS; ++i) {
+                npy_intp row_offset = i * bytes_in_row;
+
+                // We need to fill in the leftmost values of this row since we shifted by an offset
+                npy_intp dst_offset = row_offset;
+                npy_intp src_offset = row_offset + ((rowsize - shift) * itemsize);
+
+                memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
+            }
+        }
+    }
+
+    return (PyObject*)post;
+}
+
 static PyObject *
 roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
 {
@@ -943,7 +1022,7 @@ roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
     // Must be signed in order for modulo to work properly for negative shift values
     int size = (int)PyArray_DIM(array, axis);
 
-    uint8_t is_empty = (size == 0);
+    npy_uint8 is_empty = (size == 0);
 
     if (!is_empty) {
         shift = shift % size;
@@ -960,7 +1039,10 @@ roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
         return copy;
     }
 
-    return _roll_2d_a(array, (npy_uint)shift, axis);
+    if (0) {
+        return _roll_2d_a(array, (npy_uint)shift, axis);
+    }
+    return _roll_2d_b(array, (npy_uint)shift, axis);
 }
 
 

From 05b9849b5aa5bacf1e4ad7af8c96f23594a7d26c Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Fri, 9 Apr 2021 13:08:12 -0700
Subject: [PATCH 13/15] Removes roll_2d -> nothing to improve. Removes old
 roll_1d impls. Adds multithreading option for 1d.

---
 arraykit.c                    | 674 +---------------------------------
 arraykit.pyi                  |   1 -
 performance/reference/util.py |  37 --
 test/test_util.py             |  70 ----
 4 files changed, 8 insertions(+), 774 deletions(-)

diff --git a/arraykit.c b/arraykit.c
index d920da2f..b17060eb 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -273,120 +273,9 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg)
 //------------------------------------------------------------------------------
 // rolling
 
-static int
-assign_into_slice_from_slice(PyObject *dst, int dst_start, int dst_stop,
-                             PyObject *src, int src_start, int src_stop)
-{
-    PyObject* shifted_src = PySequence_GetSlice(src, src_start, src_stop);
-    if (!shifted_src) {
-        return -1;
-    }
-
-    int success = PySequence_SetSlice(dst, dst_start, dst_stop, shifted_src);
-    Py_DECREF(shifted_src);
-    return success;
-}
-
-// Naive Re-implementation of C
-static PyObject *
-_roll_1d_a(PyArrayObject* array, int shift)
-{
-    /*
-        cls           ak           ref          ref/ak
-        Roll1dInt     3.32787074   4.06750092   1.22225328
-        Roll1dFloat   3.32698173   4.06643037   1.2222581
-        Roll1dObject  37.89614459  38.76268129  1.02286609
-    */
-
-    // Create an empty array
-    PyArray_Descr* dtype = PyArray_DESCR(array);
-    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
-
-    PyObject* post = PyArray_Empty(
-                PyArray_NDIM(array),
-                PyArray_DIMS(array),
-                dtype,
-                0);
-    if (!post) {
-        return NULL;
-    }
-
-    int success;
-
-    // First Assign
-    success = assign_into_slice_from_slice(post, 0, shift, (PyObject*)array, -shift, (int)PyArray_SIZE(array));
-    if (success == -1) {
-        Py_DECREF(post);
-        return NULL;
-    }
-
-    // Second Assign
-    success = assign_into_slice_from_slice(post, shift, (int)PyArray_SIZE(array), (PyObject*)array, 0, -shift);
-    if (success == -1) {
-        Py_DECREF(post);
-        return NULL;
-    }
-
-    return post;
-}
-
-// Manual iteration using Numpy C api
 static PyObject *
-_roll_1d_b(PyArrayObject* array, int shift, int size)
+_roll_1d(PyArrayObject *array, int shift)
 {
-    /*
-        cls           ak          ref         ref/ak
-        Roll1dInt     3.94763173  0.13514971  0.03423564
-        Roll1dFloat   3.95269516  0.13621643  0.03446166
-        Roll1dObject  1.03418866  0.46459488  0.4492361
-    */
-
-    // Create an empty array
-    PyArray_Descr* dtype = PyArray_DESCR(array);
-    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
-
-    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
-                PyArray_NDIM(array),
-                PyArray_DIMS(array),
-                dtype,
-                0);
-    if (!post) {
-        return NULL;
-    }
-
-    npy_intp array_stride = PyArray_STRIDE(array, 0);
-    npy_intp post_stride = PyArray_STRIDE(post, 0);
-    char* array_dataptr = PyArray_BYTES(array);
-    char* post_dataptr = PyArray_BYTES(post);
-
-    for (int i = 0; i < size; ++i) {
-        int src_i = (i + size - shift) % size;
-
-        PyObject* obj = PyArray_GETITEM(array, array_dataptr + (array_stride * src_i));
-        if (!obj) {
-            Py_DECREF(post);
-            return NULL;
-        }
-
-        if (PyArray_SETITEM(post, post_dataptr + (i * post_stride), obj) == -1) {
-            Py_DECREF(post);
-            return NULL;
-        }
-    }
-
-    return (PyObject*)post;
-}
-
-// Being clever with C for primitives, struggling with Objects
-static PyObject *
-_roll_1d_c(PyArrayObject *array, int shift)
-{
-    /*
-        cls           ak           ref          ref/ak
-        Roll1dInt     2.82467638   4.14947038   1.46900736
-        Roll1dFloat   2.89442847   4.13699139   1.42929474
-        Roll1dObject  112.6879144  38.81264949  0.34442602
-    */
     // Tell the constructor to automatically allocate the output.
     // The data type of the output will match that of the input.
     PyArrayObject *arrays[2];
@@ -410,16 +299,6 @@ _roll_1d_c(PyArrayObject *array, int shift)
             arrays_flags,
             NULL);          // We don't have to specify dtypes since it will use array's
 
-    /* Per the documentation for NPY_ITER_REFS_OK:
-
-        Indicates that arrays with reference types (object arrays or structured arrays
-        containing an object type) may be accepted and used in the iterator. If this flag
-        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
-        is true, in which case it may not release the GIL during iteration.
-
-        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
-    */
-
     if (iter == NULL) {
         return NULL;
     }
@@ -434,125 +313,12 @@ _roll_1d_c(PyArrayObject *array, int shift)
     npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
     npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
 
-    if (!PyDataType_ISOBJECT(PyArray_DESCR(array))) {
-        do {
-            char* src_data = dataptr[0];
-            char* dst_data = dataptr[1];
-            npy_intp size = *sizeptr;
-
-            npy_intp offset = ((size - shift) % size) * itemsize;
-            npy_intp first_chunk = (size * itemsize) - offset;
-
-            memcpy(dst_data, src_data + offset, first_chunk);
-            memcpy(dst_data + first_chunk, src_data, offset);
-        } while (iternext(iter));
-    }
-    else {
-        // Object arrays contain pointers to arrays.
-        do {
-            char* src_data = dataptr[0];
-            char* dst_data = dataptr[1];
-            npy_intp size = *sizeptr;
-
-            PyObject* src_ref = NULL;
-            PyObject* dst_ref = NULL;
-
-            for (int i = 0; i < size; ++i) {
-                npy_intp offset = ((i + size - shift) % size) * itemsize;
-
-                // Update our temp PyObject* 's
-                memcpy(&src_ref, src_data + offset, sizeof(src_ref));
-                memcpy(&dst_ref, dst_data, sizeof(dst_ref));
-
-                // Copy the reference
-                memcpy(dst_data, &src_ref, sizeof(src_ref));
-
-                // Claim the reference
-                Py_XINCREF(src_ref);
-
-                // Release the reference in dst
-                Py_XDECREF(dst_ref);
-
-                dst_data += itemsize;
-            }
-        } while (iternext(iter));
-    }
-
-    // Get the result from the iterator object array
-    PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
-    if (!ret) {
-        NpyIter_Deallocate(iter);
-        return NULL;
-    }
-    Py_INCREF(ret);
-
-    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
-        Py_DECREF(ret);
-        return NULL;
-    }
-
-    return (PyObject*)ret;
-}
-
-// Being clever with C for primitives, and figuring out Objects
-static PyObject *
-_roll_1d_d(PyArrayObject *array, int shift)
-{
-    /*
-        Roll1d20kInt     2.91365521  4.25724612  1.46113586
-        Roll1d20kFloat   3.21448036  4.40039245  1.36892809
-        Roll1d20kObject  6.7969062   8.32454664  1.22475526
-        Roll1d1kInt      0.33637808  1.32518703  3.93957601
-        Roll1d1kFloat    0.32248451  1.24809331  3.87024272
-        Roll1d1kObject   1.46907919  2.9891046   2.03467901
-    */
-    // Tell the constructor to automatically allocate the output.
-    // The data type of the output will match that of the input.
-    PyArrayObject *arrays[2];
-    npy_uint32 arrays_flags[2];
-    arrays[0] = array;
-    arrays[1] = NULL;
-    arrays_flags[0] = NPY_ITER_READONLY;
-    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
-
-    // No inner iteration - inner loop is handled by CopyArray code
-    // Reference objects are OK.
-    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
-
-    // Construct the iterator
-    NpyIter *iter = NpyIter_MultiNew(
-            2,              // number of arrays
-            arrays,
-            iter_flags,
-            NPY_KEEPORDER,  // Maintain existing order for `array`
-            NPY_NO_CASTING, // Both arrays will have the same dtype so casting isn't needed or allowed
-            arrays_flags,
-            NULL);          // We don't have to specify dtypes since it will use array's
-
-    /* Per the documentation for NPY_ITER_REFS_OK:
-
-        Indicates that arrays with reference types (object arrays or structured arrays
-        containing an object type) may be accepted and used in the iterator. If this flag
-        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
-        is true, in which case it may not release the GIL during iteration.
-
-        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
-    */
-
-    if (iter == NULL) {
-        return NULL;
+    // If we don't need the GIL, iteration can be multi-threaded!
+    NPY_BEGIN_THREADS_DEF;
+    if (!NpyIter_IterationNeedsAPI(iter)) {
+        NPY_BEGIN_THREADS;
     }
 
-    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
-    if (!iternext) {
-        NpyIter_Deallocate(iter);
-        return NULL;
-    }
-
-    char** dataptr = NpyIter_GetDataPtrArray(iter);
-    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
-    npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
-
     do {
         char* src_data = dataptr[0];
         char* dst_data = dataptr[1];
@@ -574,6 +340,8 @@ _roll_1d_d(PyArrayObject *array, int shift)
         }
     } while (iternext(iter));
 
+    NPY_END_THREADS;
+
     // Get the result from the iterator object array
     PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
     if (!ret) {
@@ -593,21 +361,6 @@ _roll_1d_d(PyArrayObject *array, int shift)
 static PyObject *
 roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
 {
-    /* Algorithm.
-
-        size = len(array)
-        if size <= 1:
-            return array.copy()
-
-        shift = shift % size
-        if shift == 0:
-            return array.copy()
-
-        post = np.empty(size, dtype=array.dtype)
-        post[0:shift] = array[-shift:]
-        post[shift:] = array[0:-shift]
-        return post
-    */
     PyArrayObject *array;
     int shift;
 
@@ -633,419 +386,9 @@ roll_1d(PyObject *Py_UNUSED(m), PyObject *args)
         return copy;
     }
 
-    // Silence UnuSEd fUnCTioN warnings.
-    if (0) {
-        return _roll_1d_a(array, shift);       // Basically the same
-        return _roll_1d_b(array, shift, size); // Way slower
-        return _roll_1d_c(array, shift);       // Faster for primitives, same for objects
-    }
-    return _roll_1d_d(array, shift);         // Faster for primitives & objects!
+    return _roll_1d(array, shift);
 }
 
-// -----------------------------------------------------------------------------
-
-static PyObject *
-_roll_2d_a(PyArrayObject *array, npy_uint shift, int axis)
-{
-    /*
-    if axis == 0: # roll rows
-        post[0:shift, :] = array[-shift:, :]
-        post[shift:, :] = array[0:-shift, :]
-        return post
-
-    # roll columns
-    post[:, 0:shift] = array[:, -shift:]
-    post[:, shift:] = array[:, 0:-shift]
-    */
-    // Tell the constructor to automatically allocate the output.
-    // The data type of the output will match that of the input.
-    PyArrayObject *arrays[2];
-    npy_uint32 arrays_flags[2];
-    arrays[0] = array;
-    arrays[1] = NULL;
-    arrays_flags[0] = NPY_ITER_READONLY;
-    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
-
-    // No inner iteration - inner loop is handled by CopyArray code
-    // Reference objects are OK.
-    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
-
-    // Construct the iterator
-    NpyIter *iter = NpyIter_MultiNew(
-            2,              // number of arrays
-            arrays,
-            iter_flags,
-            NPY_KEEPORDER,
-            NPY_NO_CASTING, // Both arrays will have the same dtype so casting isn't needed or allowed
-            arrays_flags,
-            NULL);          // We don't have to specify dtypes since it will use array's
-
-    /* Per the documentation for NPY_ITER_REFS_OK:
-
-        Indicates that arrays with reference types (object arrays or structured arrays
-        containing an object type) may be accepted and used in the iterator. If this flag
-        is enabled, the caller must be sure to check whether NpyIter_IterationNeedsAPI(iter)
-        is true, in which case it may not release the GIL during iteration.
-
-        However, `NpyIter_IterationNeedsAPI` is not documented at all. So.......
-    */
-
-    if (iter == NULL) {
-        return NULL;
-    }
-
-    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
-    if (!iternext) {
-        NpyIter_Deallocate(iter);
-        return NULL;
-    }
-
-    char** dataptr = NpyIter_GetDataPtrArray(iter);
-    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
-    npy_intp itemsize = NpyIter_GetDescrArray(iter)[0]->elsize;
-
-    npy_uint NUM_ROWS = (npy_uint)PyArray_DIM(array, 0);
-    npy_uint rowsize  = (npy_uint)PyArray_DIM(array, 1);
-    npy_uint bytes_in_row = rowsize * itemsize;
-
-    do {
-        char *src_data = dataptr[0];
-        char *dst_data = dataptr[1];
-        npy_intp size = *sizeptr;
-        npy_uint total_bytes = size * itemsize;
-
-        if (axis == 0) {
-            /*
-            Shift by rows! This is the easy case.
-
-            Imagine we have this array:
-            [0 1 2]
-            [3 4 5]
-            [6 7 8]
-
-            In memory, this is stored contiguously as: [0 1 2 3 4 5 6 7 8]
-            Placing parentheses, we can visualize where the rows are like so:
-                [(0 1 2) (3 4 5) (6 7 8)]
-
-            Given this, all we are concerned about is two contiguous blocks of memory.
-
-            For example, if shift = -1, we can copy from row[1] -> END to the front
-
-            source = [(0 1 2) (3 4 5) (6 7 8)]
-                               | | |   | | |
-                         -----------------
-                       | | |   | | |
-                       V V V   V V V
-            buffer = [(3 4 5) (6 7 8) (X X X)]
-
-            Now, we fill in the missing tail bytes with row[0] from the src buffer
-
-            source = [(0 1 2) (3 4 5) (6 7 8)]
-                       | | |
-                         -----------------
-                                       | | |
-                                       V V V
-            buffer = [(3 4 5) (6 7 8) (0 1 2)]
-
-            Now, our internal memory represents the result of a row shift.
-            We can see this if we represent the final buffer as a 2D grid:
-
-            [3 4 5]
-            [6 7 8]
-            [0 1 2]
-            */
-
-            // Easiest case! Merely shift the rows
-            npy_intp offset = (NUM_ROWS - shift) * bytes_in_row;
-            npy_intp chunksize = total_bytes - offset;
-
-            memcpy(dst_data, src_data + offset, chunksize);
-            memcpy(dst_data + chunksize, src_data, offset);
-        }
-        else {
-            /*
-            Shift by columns! This is the more difficult case.
-
-            Let's use a slightly different array
-            [0 1 2 3 4]
-            [5 6 7 8 9]
-            [A B C D E]
-
-            If we shift by 2, our goal array will be:
-            [3 4 0 1 2]
-            [8 9 5 6 7]
-            [D E A B C]
-
-            Alternatively, we want our contiguous memory to go from:
-
-            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
-            buffer = [(3 4 0 1 2) (8 9 5 6 7) (D E A B C)]
-
-            In order to do this as efficiently as possible, we first fill the result buffer with the source shifted.
-
-            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
-                        \ \ \ \ \   \ \ \ \ \   \ \ \
-                         \ \ \  ----  \ \ \ ----  \ \ \
-                          \ \ \    \ \ \ \ \   \ \ \ \ \
-            buffer = [(X X 0 1 2) (3 4 5 6 7) (8 9 A B C)]
-
-            Now, all that's left is to fix the incorrect values
-
-            buffer = [(X X 0 1 2) (3 4 5 6 7) (8 9 A B C)]
-                       ^ ^         ^ ^         ^ ^
-
-            We can fill these by copying the values from each row
-
-            source = [(0 1 2 3 4) (5 6 7 8 9) (A B C D E)]
-                             | |         | |         | |
-                        -------     -------     -------
-                       | |         | |         | |
-                       V V         V V         V V
-            buffer = [(3 4 0 1 2) (8 9 5 6 7) (D E A B C)]
-
-            Now, our internal memory represents the result of a row shift.
-            We can see this if we represent the final buffer as a 2D grid:
-
-            [3 4 0 1 2]
-            [8 9 5 6 7]
-            [D E A B C]
-            */
-            if (shift > rowsize / 2) {
-                /* SHIFT LEFT
-
-                This branch is optimized for cases where the offset is greater than half of the columns.
-
-                For this, instead of shifting right and being forced to fill in a large section for each row,
-                we shift left and only have to fill in small section
-
-                Example: Shift by 4
-
-                Inefficient
-                [0 1 2 3 4]   [0 1 2 3 4]
-                 \               | | | |
-                  ------        -------
-                         \     | | | |
-                         V     V V V V
-                [X X X X 0]   [1 2 3 4 0]
-
-                Efficient
-                [0 1 2 3 4]   [0 1 2 3 4]
-                  / / / /      |
-                  | | | |       -------
-                  | | | |              |
-                 / / / /               V
-                [1 2 3 4 X]   [1 2 3 4 0]
-                */
-                npy_intp offset = (rowsize - shift) * itemsize;
-                npy_intp num_bytes = total_bytes - offset;
-                memcpy(dst_data, src_data + offset, num_bytes);
-
-                num_bytes = offset; // This is how much we need to copy for each column.
-
-                // Update the shifted portion of each row.
-                for (size_t i = 0; i < NUM_ROWS; ++i) {
-                    npy_intp row_offset = i * bytes_in_row;
-
-                    // We need to fill in the rightmost values of this row since we shifted by an offset
-                    npy_intp dst_offset = row_offset + bytes_in_row - num_bytes;
-                    npy_intp src_offset = row_offset;
-
-                    memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
-                }
-            }
-            else {
-                // SHIFT RIGHT
-                npy_intp offset = shift * itemsize;
-                npy_intp num_bytes = total_bytes - offset;
-                memcpy(dst_data+offset, src_data, num_bytes);
-
-                num_bytes = offset; // This is how much we need to copy for each column.
-
-                // Update the shifted portion of each row.
-                for (size_t i = 0; i < NUM_ROWS; ++i) {
-                    npy_intp row_offset = i * bytes_in_row;
-
-                    // We need to fill in the leftmost values of this row since we shifted by an offset
-                    npy_intp dst_offset = row_offset;
-                    npy_intp src_offset = row_offset + ((rowsize - shift) * itemsize);
-
-                    memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
-                }
-            }
-        }
-    } while (iternext(iter));
-
-    // Get the result from the iterator object array
-    PyArrayObject *ret = NpyIter_GetOperandArray(iter)[1];
-    if (!ret) {
-        NpyIter_Deallocate(iter);
-        return NULL;
-    }
-    Py_INCREF(ret);
-
-    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
-        Py_DECREF(ret);
-        return NULL;
-    }
-
-    return (PyObject*)ret;
-}
-
-static PyObject *
-_roll_2d_b(PyArrayObject *array, npy_uint shift, int axis)
-{
-    // Create an empty array
-    PyArray_Descr* dtype = PyArray_DESCR(array);
-    Py_INCREF(dtype); // PyArray_Empty steals a reference to dtype
-
-    PyArrayObject* post = (PyArrayObject*)PyArray_Empty(
-                PyArray_NDIM(array),
-                PyArray_DIMS(array),
-                dtype,
-                0);
-    if (!post) {
-        return NULL;
-    }
-
-    npy_intp itemsize = dtype->elsize;
-    npy_intp size = PyArray_SIZE(array);
-
-    char* src_data = PyArray_BYTES(array);
-    char* dst_data = PyArray_BYTES(post);
-
-    npy_uint NUM_ROWS = (npy_uint)PyArray_DIM(array, 0);
-    npy_uint rowsize  = (npy_uint)PyArray_DIM(array, 1);
-    npy_uint bytes_in_row = rowsize * itemsize;
-    npy_uint total_bytes = size * itemsize;
-
-    if (axis == 0) {
-        // Easiest case! Merely shift the rows
-        npy_intp offset = (NUM_ROWS - shift) * bytes_in_row;
-        npy_intp chunksize = total_bytes - offset;
-
-        memcpy(dst_data, src_data + offset, chunksize);
-        memcpy(dst_data + chunksize, src_data, offset);
-    }
-    else {
-        if (shift > rowsize / 2) {
-            // SHIFT LEFT
-            npy_intp offset = (rowsize - shift) * itemsize;
-            npy_intp num_bytes = total_bytes - offset;
-            memcpy(dst_data, src_data + offset, num_bytes);
-
-            num_bytes = offset; // This is how much we need to copy for each column.
-
-            // Update the shifted portion of each row.
-            for (size_t i = 0; i < NUM_ROWS; ++i) {
-                npy_intp row_offset = i * bytes_in_row;
-
-                // We need to fill in the rightmost values of this row since we shifted by an offset
-                npy_intp dst_offset = row_offset + bytes_in_row - num_bytes;
-                npy_intp src_offset = row_offset;
-
-                memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
-            }
-        }
-        else {
-            // SHIFT RIGHT
-            npy_intp offset = shift * itemsize;
-            npy_intp num_bytes = total_bytes - offset;
-            memcpy(dst_data + offset, src_data, num_bytes);
-
-            num_bytes = offset; // This is how much we need to copy for each column.
-
-            // Update the shifted portion of each row.
-            for (size_t i = 0; i < NUM_ROWS; ++i) {
-                npy_intp row_offset = i * bytes_in_row;
-
-                // We need to fill in the leftmost values of this row since we shifted by an offset
-                npy_intp dst_offset = row_offset;
-                npy_intp src_offset = row_offset + ((rowsize - shift) * itemsize);
-
-                memcpy(dst_data + dst_offset, src_data + src_offset, num_bytes);
-            }
-        }
-    }
-
-    return (PyObject*)post;
-}
-
-static PyObject *
-roll_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
-{
-    /* Algorithm.
-
-        size = array.shape[axis]
-
-        if shift != 0:
-            shift = shift % size
-
-        if size <= 1 or shift == 0:
-            return array.copy()
-
-        if shift < 0:
-            shift = size + shift
-
-        if axis == 0:
-            post[0:shift, :] = array[-shift:, :]
-            post[shift:, :] = array[0:-shift, :]
-            return post
-
-        post[:, 0:shift] = array[:, -shift:]
-        post[:, shift:] = array[:, 0:-shift]
-        return post
-    */
-    PyArrayObject *array;
-    int shift;
-    int axis; // npy_intp
-
-    static char *kwlist[] = {"array", "shift", "axis", NULL};
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!ii:roll_1d",
-                                     kwlist,
-                                     &PyArray_Type, &array,
-                                     &shift, &axis))
-    {
-        return NULL;
-    }
-
-    if (axis != 0 && axis != 1) {
-        PyErr_SetString(PyExc_ValueError, "Axis must be 0 or 1");
-        return NULL;
-    }
-
-    if (PyArray_NDIM(array) != 2) {
-        PyErr_SetString(PyExc_ValueError, "Array must be 2D");
-        return NULL;
-    }
-
-    // Must be signed in order for modulo to work properly for negative shift values
-    int size = (int)PyArray_DIM(array, axis);
-
-    npy_uint8 is_empty = (size == 0);
-
-    if (!is_empty) {
-        shift = shift % size;
-        if (shift < 0) {
-            shift = size + shift;
-        }
-    }
-
-    if (is_empty || (shift == 0)) {
-        PyObject* copy = PyArray_Copy(array);
-        if (!copy) {
-            return NULL;
-        }
-        return copy;
-    }
-
-    if (0) {
-        return _roll_2d_a(array, (npy_uint)shift, axis);
-    }
-    return _roll_2d_b(array, (npy_uint)shift, axis);
-}
-
-
 //------------------------------------------------------------------------------
 // ArrayGO
 //------------------------------------------------------------------------------
@@ -1323,7 +666,6 @@ static PyMethodDef arraykit_methods[] =  {
     {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL},
     {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
     {"roll_1d", roll_1d, METH_VARARGS, NULL},
-    {"roll_2d", (PyCFunction)roll_2d, METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL},
 };
 
diff --git a/arraykit.pyi b/arraykit.pyi
index 9ede4fc0..28c787db 100644
--- a/arraykit.pyi
+++ b/arraykit.pyi
@@ -28,4 +28,3 @@ def row_1d_filter(__array: np.array) -> np.ndarray: ...
 def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
 def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
 def roll_1d(__array: np.ndarray, __shift: int) -> np.ndarray: ...
-def roll_2d(__array: np.ndarray, __shift: int, __axis: int) -> np.ndarray: ...
diff --git a/performance/reference/util.py b/performance/reference/util.py
index a9c4a8f9..8a566a01 100644
--- a/performance/reference/util.py
+++ b/performance/reference/util.py
@@ -201,40 +201,3 @@ def roll_1d(array: np.ndarray, shift: int) -> np.ndarray:
     post[0:shift] = array[-shift:]
     post[shift:] = array[0:-shift]
     return post
-
-
-def roll_2d(array: np.ndarray, shift: int, axis: int) -> np.ndarray:
-    '''
-    Specialized form of np.roll that, by focusing on the 2D solution
-    '''
-    post = np.empty(array.shape, dtype=array.dtype)
-
-    if axis == 0: # roll rows
-        size = array.shape[0]
-        if size <= 1:
-            return array.copy()
-
-        # result will be positive
-        shift = shift % size
-        if shift == 0:
-            return array.copy()
-
-        post[0:shift, :] = array[-shift:, :]
-        post[shift:, :] = array[0:-shift, :]
-        return post
-
-    elif axis == 1: # roll columns
-        size = array.shape[1]
-        if size <= 1:
-            return array.copy()
-
-        # result will be positive
-        shift = shift % size
-        if shift == 0:
-            return array.copy()
-
-        post[:, 0:shift] = array[:, -shift:]
-        post[:, shift:] = array[:, 0:-shift]
-        return post
-
-    raise NotImplementedError()
diff --git a/test/test_util.py b/test/test_util.py
index 75784466..5b813790 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -14,7 +14,6 @@
 from performance.reference.util import mloc as mloc_ref
 #from performance.reference.util import roll_1d
 from arraykit import roll_1d
-from arraykit import roll_2d
 
 
 class TestUnit(unittest.TestCase):
@@ -191,75 +190,6 @@ def test_roll_1d_c(self) -> None:
         self.assertEqual(roll_1d(a1, 1).tolist(), [6, 3, 4, 5])
         self.assertEqual(roll_1d(a1, -1).tolist(), [4, 5, 6, 3])
 
-    #---------------------------------------------------------------------------
-    def test_roll_2d_row(self) -> None:
-        arr = np.arange(15).reshape(3, 5)
-
-        for shift in range(-10, 10):
-            np_result = np.roll(arr, shift, axis=0)
-            ak_result = roll_2d(arr, shift, axis=0)
-            self.assertTrue((np_result == ak_result).all())
-
-    def test_roll_2d_col(self) -> None:
-        arr = np.arange(15).reshape(3, 5)
-
-        for shift in range(-10, 10):
-            np_result = np.roll(arr, shift, axis=1)
-            ak_result = roll_2d(arr, shift, axis=1)
-            self.assertTrue((np_result == ak_result).all())
-
-    def test_roll_2d_a(self) -> None:
-        a1 = np.arange(12).reshape((3,4))
-
-        for i in range(a1.shape[0] + 1):
-            post = roll_2d(a1, i, axis=0)
-            self.assertEqual(post.tolist(), np.roll(a1, i, axis=0).tolist())
-
-            post = roll_2d(a1, -i, axis=0)
-            self.assertEqual(post.tolist(), np.roll(a1, -i, axis=0).tolist())
-
-        for i in range(a1.shape[1] + 1):
-            post = roll_2d(a1, i, axis=1)
-            self.assertEqual(post.tolist(), np.roll(a1, i, axis=1).tolist())
-
-            post = roll_2d(a1, -i, axis=1)
-            self.assertEqual(post.tolist(), np.roll(a1, -i, axis=1).tolist())
-
-    def test_roll_2d_b(self) -> None:
-        post = roll_2d(np.array([[]]), -4, axis=1)
-        self.assertEqual(post.shape, (1, 0))
-
-    def test_roll_2d_c(self) -> None:
-        a1 = np.arange(12).reshape((3,4))
-
-        self.assertEqual(roll_2d(a1, -2, axis=0).tolist(),
-                [[8, 9, 10, 11], [0, 1, 2, 3], [4, 5, 6, 7]])
-
-        self.assertEqual(roll_2d(a1, -2, axis=1).tolist(),
-                [[2, 3, 0, 1], [6, 7, 4, 5], [10, 11, 8, 9]])
-
-    def test_roll_2d_d(self) -> None:
-        a1 = np.arange(6).reshape((2, 3))
-
-        self.assertEqual(roll_2d(a1, 1, axis=1).tolist(),
-                [[2, 0, 1], [5, 3, 4]])
-        self.assertEqual(roll_2d(a1, -1, axis=1).tolist(),
-                [[1, 2, 0], [4, 5, 3]])
-
-    def test_roll_2d_e(self) -> None:
-        a1 = np.arange(6).reshape((3, 2))
-
-        self.assertEqual(roll_2d(a1, 1, axis=0).tolist(),
-                [[4, 5], [0, 1], [2, 3]]
-                )
-        self.assertEqual(roll_2d(a1, -1, axis=0).tolist(),
-                [[2, 3], [4, 5], [0, 1]]
-                )
-
-    def test_roll_2d_f(self) -> None:
-        with self.assertRaises(ValueError):
-            roll_2d(np.arange(4).reshape((2, 2)), 1, axis=2)
-
 
 if __name__ == '__main__':
     unittest.main()

From aa8eb1bb7be3c174c145fbc0e1e44c78c4c9d9b0 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Mon, 26 Apr 2021 10:39:47 -0700
Subject: [PATCH 14/15] Removes some test code.

---
 test/test_util.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_util.py b/test/test_util.py
index 791a771c..04a23745 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -11,10 +11,9 @@
 from arraykit import mloc
 from arraykit import immutable_filter
 from arraykit import array_deepcopy
+from arraykit import roll_1d
 
 from performance.reference.util import mloc as mloc_ref
-#from performance.reference.util import roll_1d
-from arraykit import roll_1d
 
 
 class TestUnit(unittest.TestCase):

From cd046b3c31d167f277ce2c70bc693240a62d8de5 Mon Sep 17 00:00:00 2001
From: Charles Burkland <burkland@rallc.com>
Date: Mon, 21 Jun 2021 15:06:40 -0700
Subject: [PATCH 15/15] Fixes mistakes introduced by merge conflicts.

---
 performance/__main__.py | 12 ++++++------
 src/__init__.py         |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/performance/__main__.py b/performance/__main__.py
index c182e787..ac9d8018 100644
--- a/performance/__main__.py
+++ b/performance/__main__.py
@@ -374,7 +374,7 @@ class Roll1d20kInt(Perf):
     NUMBER = 10
     SIZE = 20_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE)
 
     def main(self):
@@ -385,7 +385,7 @@ class Roll1d20kFloat(Perf):
     NUMBER = 10
     SIZE = 20_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE).astype(float)
 
     def main(self):
@@ -396,7 +396,7 @@ class Roll1d20kObject(Perf):
     NUMBER = 2
     SIZE = 20_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE).astype(object)
 
     def main(self):
@@ -407,7 +407,7 @@ class Roll1d1kInt(Perf):
     NUMBER = 10
     SIZE = 1_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE)
 
     def main(self):
@@ -418,7 +418,7 @@ class Roll1d1kFloat(Perf):
     NUMBER = 10
     SIZE = 1_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE).astype(float)
 
     def main(self):
@@ -429,7 +429,7 @@ class Roll1d1kObject(Perf):
     NUMBER = 10
     SIZE = 1_000
 
-    def pre(self):
+    def __init__(self):
         self.array = np.arange(self.SIZE).astype(object)
 
     def main(self):
diff --git a/src/__init__.py b/src/__init__.py
index 988ca110..52945c33 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -16,3 +16,4 @@
 from ._arraykit import resolve_dtype_iter as resolve_dtype_iter
 from ._arraykit import isna_element as isna_element
 from ._arraykit import dtype_from_element as dtype_from_element
+from ._arraykit import roll_1d as roll_1d