From 33b48fefb2ee2795e4b8b598a864480f5028cfdb Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:04:46 +0000 Subject: [PATCH 01/20] umath refactor --- quaddtype/meson.build | 13 +- .../numpy_quaddtype/src/quaddtype_main.c | 2 +- quaddtype/numpy_quaddtype/src/umath.cpp | 806 ------------------ .../numpy_quaddtype/src/umath/binary_ops.cpp | 235 +++++ .../numpy_quaddtype/src/umath/binary_ops.h | 9 + .../src/umath/comparison_ops.cpp | 240 ++++++ .../src/umath/comparison_ops.h | 9 + .../numpy_quaddtype/src/umath/promoters.hpp | 90 ++ quaddtype/numpy_quaddtype/src/umath/umath.cpp | 115 +++ .../numpy_quaddtype/src/{ => umath}/umath.h | 2 +- .../numpy_quaddtype/src/umath/unary_ops.cpp | 214 +++++ .../numpy_quaddtype/src/umath/unary_ops.h | 9 + 12 files changed, 933 insertions(+), 811 deletions(-) delete mode 100644 quaddtype/numpy_quaddtype/src/umath.cpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.h create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.h create mode 100644 quaddtype/numpy_quaddtype/src/umath/promoters.hpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/umath.cpp rename quaddtype/numpy_quaddtype/src/{ => umath}/umath.h (95%) create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.h diff --git a/quaddtype/meson.build b/quaddtype/meson.build index d1c67990..66318d60 100644 --- a/quaddtype/meson.build +++ b/quaddtype/meson.build @@ -50,12 +50,19 @@ srcs = [ 'numpy_quaddtype/src/scalar_ops.h', 'numpy_quaddtype/src/scalar_ops.cpp', 'numpy_quaddtype/src/ops.hpp', - 'numpy_quaddtype/src/umath.h', - 'numpy_quaddtype/src/umath.cpp', 'numpy_quaddtype/src/dragon4.h', 'numpy_quaddtype/src/dragon4.c', 'numpy_quaddtype/src/quadblas_interface.h', - 'numpy_quaddtype/src/quadblas_interface.cpp' + 'numpy_quaddtype/src/quadblas_interface.cpp', + 'numpy_quaddtype/src/umath/umath.h', + 'numpy_quaddtype/src/umath/umath.cpp', + 'numpy_quaddtype/src/umath/binary_ops.h', + 'numpy_quaddtype/src/umath/binary_ops.cpp', + 'numpy_quaddtype/src/umath/unary_ops.h', + 'numpy_quaddtype/src/umath/unary_ops.cpp', + 'numpy_quaddtype/src/umath/comparison_ops.h', + 'numpy_quaddtype/src/umath/comparison_ops.cpp', + 'numpy_quaddtype/src/umath/promoters.hpp', ] py.install_sources( diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c index 9e2b8430..641200d3 100644 --- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c +++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c @@ -14,7 +14,7 @@ #include "scalar.h" #include "dtype.h" -#include "umath.h" +#include "umath/umath.h" #include "quad_common.h" #include "quadblas_interface.h" #include "float.h" diff --git a/quaddtype/numpy_quaddtype/src/umath.cpp b/quaddtype/numpy_quaddtype/src/umath.cpp deleted file mode 100644 index 0058236a..00000000 --- a/quaddtype/numpy_quaddtype/src/umath.cpp +++ /dev/null @@ -1,806 +0,0 @@ -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY -#define NO_IMPORT_UFUNC - -extern "C" { -#include -#include - -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/ufuncobject.h" - -#include "numpy/dtype_api.h" -} -#include "quad_common.h" -#include "scalar.h" -#include "dtype.h" -#include "umath.h" -#include "ops.hpp" - -// helper debugging function -static const char * -get_dtype_name(PyArray_DTypeMeta *dtype) -{ - if (dtype == &QuadPrecDType) { - return "QuadPrecDType"; - } - else if (dtype == &PyArray_BoolDType) { - return "BoolDType"; - } - else if (dtype == &PyArray_ByteDType) { - return "ByteDType"; - } - else if (dtype == &PyArray_UByteDType) { - return "UByteDType"; - } - else if (dtype == &PyArray_ShortDType) { - return "ShortDType"; - } - else if (dtype == &PyArray_UShortDType) { - return "UShortDType"; - } - else if (dtype == &PyArray_IntDType) { - return "IntDType"; - } - else if (dtype == &PyArray_UIntDType) { - return "UIntDType"; - } - else if (dtype == &PyArray_LongDType) { - return "LongDType"; - } - else if (dtype == &PyArray_ULongDType) { - return "ULongDType"; - } - else if (dtype == &PyArray_LongLongDType) { - return "LongLongDType"; - } - else if (dtype == &PyArray_ULongLongDType) { - return "ULongLongDType"; - } - else if (dtype == &PyArray_FloatDType) { - return "FloatDType"; - } - else if (dtype == &PyArray_DoubleDType) { - return "DoubleDType"; - } - else if (dtype == &PyArray_LongDoubleDType) { - return "LongDoubleDType"; - } - else { - return "UnknownDType"; - } -} - -static NPY_CASTING -quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], - npy_intp *NPY_UNUSED(view_offset)) -{ - Py_INCREF(given_descrs[0]); - loop_descrs[0] = given_descrs[0]; - - if (given_descrs[1] == NULL) { - Py_INCREF(given_descrs[0]); - loop_descrs[1] = given_descrs[0]; - } - else { - Py_INCREF(given_descrs[1]); - loop_descrs[1] = given_descrs[1]; - } - - QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0]; - QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1]; - - if (descr_in->backend != descr_out->backend) { - return NPY_UNSAFE_CASTING; - } - - return NPY_NO_CASTING; -} - -template -int -quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in_ptr = data[0]; - char *out_ptr = data[1]; - npy_intp in_stride = strides[0]; - npy_intp out_stride = strides[1]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - - quad_value in, out; - while (N--) { - memcpy(&in, in_ptr, elem_size); - if (backend == BACKEND_SLEEF) { - out.sleef_value = sleef_op(&in.sleef_value); - } - else { - out.longdouble_value = longdouble_op(&in.longdouble_value); - } - memcpy(out_ptr, &out, elem_size); - - in_ptr += in_stride; - out_ptr += out_stride; - } - return 0; -} - -template -int -quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in_ptr = data[0]; - char *out_ptr = data[1]; - npy_intp in_stride = strides[0]; - npy_intp out_stride = strides[1]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - - while (N--) { - if (backend == BACKEND_SLEEF) { - *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr); - } - else { - *(long double *)out_ptr = longdouble_op((long double *)in_ptr); - } - in_ptr += in_stride; - out_ptr += out_stride; - } - return 0; -} - -template -int -create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name) -{ - PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); - if (ufunc == NULL) { - return -1; - } - - PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType}; - - PyType_Slot slots[] = { - {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors}, - {NPY_METH_strided_loop, - (void *)&quad_generic_unary_op_strided_loop_aligned}, - {NPY_METH_unaligned_strided_loop, - (void *)&quad_generic_unary_op_strided_loop_unaligned}, - {0, NULL}}; - - PyArrayMethod_Spec Spec = { - .name = "quad_unary_op", - .nin = 1, - .nout = 1, - .casting = NPY_NO_CASTING, - .flags = NPY_METH_SUPPORTS_UNALIGNED, - .dtypes = dtypes, - .slots = slots, - }; - - if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - return -1; - } - - return 0; -} - -int -init_quad_unary_ops(PyObject *numpy) -{ - if (create_quad_unary_ufunc(numpy, "negative") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "positive") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "absolute") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "rint") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "trunc") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "floor") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "ceil") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "sqrt") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "square") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log2") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log10") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log1p") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "exp") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "exp2") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "sin") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "cos") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "tan") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "arcsin") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "arccos") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "arctan") < 0) { - return -1; - } - return 0; -} - -// Binary ufuncs - -static NPY_CASTING -quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], - PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) -{ - QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; - QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; - QuadBackendType target_backend; - - // Determine target backend and if casting is needed - NPY_CASTING casting = NPY_NO_CASTING; - if (descr_in1->backend != descr_in2->backend) { - target_backend = BACKEND_LONGDOUBLE; - casting = NPY_SAFE_CASTING; - } - else { - target_backend = descr_in1->backend; - } - - // Set up input descriptors, casting if necessary - for (int i = 0; i < 2; i++) { - if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { - loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[i]) { - return (NPY_CASTING)-1; - } - } - else { - Py_INCREF(given_descrs[i]); - loop_descrs[i] = given_descrs[i]; - } - } - - // Set up output descriptor - if (given_descrs[2] == NULL) { - loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[2]) { - return (NPY_CASTING)-1; - } - } - else { - QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2]; - if (descr_out->backend != target_backend) { - loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[2]) { - return (NPY_CASTING)-1; - } - } - else { - Py_INCREF(given_descrs[2]); - loop_descrs[2] = given_descrs[2]; - } - } - return casting; -} - -template -int -quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - - quad_value in1, in2, out; - while (N--) { - memcpy(&in1, in1_ptr, elem_size); - memcpy(&in2, in2_ptr, elem_size); - if (backend == BACKEND_SLEEF) { - out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value); - } - else { - out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value); - } - memcpy(out_ptr, &out, elem_size); - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; - } - return 0; -} - -template -int -quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - - while (N--) { - if (backend == BACKEND_SLEEF) { - *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr); - } - else { - *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr); - } - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; - } - return 0; -} - -static int -quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], - PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) -{ - int nin = ufunc->nin; - int nargs = ufunc->nargs; - PyArray_DTypeMeta *common = NULL; - bool has_quad = false; - - // Handle the special case for reductions - if (op_dtypes[0] == NULL) { - assert(nin == 2 && ufunc->nout == 1); /* must be reduction */ - for (int i = 0; i < 3; i++) { - Py_INCREF(op_dtypes[1]); - new_op_dtypes[i] = op_dtypes[1]; - } - return 0; - } - - // Check if any input or signature is QuadPrecision - for (int i = 0; i < nin; i++) { - if (op_dtypes[i] == &QuadPrecDType) { - has_quad = true; - } - } - - if (has_quad) { - common = &QuadPrecDType; - } - else { - for (int i = nin; i < nargs; i++) { - if (signature[i] != NULL) { - if (common == NULL) { - Py_INCREF(signature[i]); - common = signature[i]; - } - else if (common != signature[i]) { - Py_CLEAR(common); // Not homogeneous, unset common - break; - } - } - } - } - // If no common output dtype, use standard promotion for inputs - if (common == NULL) { - common = PyArray_PromoteDTypeSequence(nin, op_dtypes); - if (common == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) { - PyErr_Clear(); // Do not propagate normal promotion errors - } - - return -1; - } - } - - // Set all new_op_dtypes to the common dtype - for (int i = 0; i < nargs; i++) { - if (signature[i]) { - // If signature is specified for this argument, use it - Py_INCREF(signature[i]); - new_op_dtypes[i] = signature[i]; - } - else { - // Otherwise, use the common dtype - Py_INCREF(common); - - new_op_dtypes[i] = common; - } - } - - Py_XDECREF(common); - - return 0; -} - -template -int -create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name) -{ - PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); - if (ufunc == NULL) { - return -1; - } - - PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; - - PyType_Slot slots[] = { - {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors}, - {NPY_METH_strided_loop, - (void *)&quad_generic_binop_strided_loop_aligned}, - {NPY_METH_unaligned_strided_loop, - (void *)&quad_generic_binop_strided_loop_unaligned}, - {0, NULL}}; - - PyArrayMethod_Spec Spec = { - .name = "quad_binop", - .nin = 2, - .nout = 1, - .casting = NPY_NO_CASTING, - .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE), - .dtypes = dtypes, - .slots = slots, - }; - - if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - return -1; - } - - PyObject *promoter_capsule = - PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL); - if (promoter_capsule == NULL) { - return -1; - } - - PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type); - if (DTypes == 0) { - Py_DECREF(promoter_capsule); - return -1; - } - - if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { - Py_DECREF(promoter_capsule); - Py_DECREF(DTypes); - return -1; - } - Py_DECREF(promoter_capsule); - Py_DECREF(DTypes); - return 0; -} - -int -init_quad_binary_ops(PyObject *numpy) -{ - if (create_quad_binary_ufunc(numpy, "add") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "subtract") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "multiply") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "divide") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "power") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "mod") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "minimum") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "maximum") < 0) { - return -1; - } - if (create_quad_binary_ufunc(numpy, "arctan2") < 0) { - return -1; - } - return 0; -} - -// comparison functions - -static NPY_CASTING -quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], - PyArray_Descr *loop_descrs[], - npy_intp *NPY_UNUSED(view_offset)) -{ - QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; - QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; - QuadBackendType target_backend; - - // As dealing with different backends then cast to boolean - NPY_CASTING casting = NPY_NO_CASTING; - if (descr_in1->backend != descr_in2->backend) { - target_backend = BACKEND_LONGDOUBLE; - casting = NPY_SAFE_CASTING; - } - else { - target_backend = descr_in1->backend; - } - - // Set up input descriptors, casting if necessary - for (int i = 0; i < 2; i++) { - if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { - loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[i]) { - return (NPY_CASTING)-1; - } - } - else { - Py_INCREF(given_descrs[i]); - loop_descrs[i] = given_descrs[i]; - } - } - - // Set up output descriptor - loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL); - if (!loop_descrs[2]) { - return (NPY_CASTING)-1; - } - return casting; -} - -template -int -quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - - quad_value in1, in2; - while (N--) { - memcpy(&in1, in1_ptr, elem_size); - memcpy(&in2, in2_ptr, elem_size); - npy_bool result; - - if (backend == BACKEND_SLEEF) { - result = sleef_comp(&in1.sleef_value, &in2.sleef_value); - } - else { - result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); - } - - memcpy(out_ptr, &result, sizeof(npy_bool)); - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; - } - return 0; -} - -template -int -quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - while (N--) { - quad_value in1 = *(quad_value *)in1_ptr; - quad_value in2 = *(quad_value *)in2_ptr; - - npy_bool result; - - if (backend == BACKEND_SLEEF) { - result = sleef_comp(&in1.sleef_value, &in2.sleef_value); - } - else { - result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); - } - - *(npy_bool *)out_ptr = result; - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; - } - return 0; -} - -NPY_NO_EXPORT int -comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], - PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) -{ - PyArray_DTypeMeta *new_signature[NPY_MAXARGS]; - memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *)); - new_signature[2] = NULL; - int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes); - if (res < 0) { - return -1; - } - Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType); - return 0; -} - -template -int -create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name) -{ - PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); - if (ufunc == NULL) { - return -1; - } - - PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType}; - - PyType_Slot slots[] = { - {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors}, - {NPY_METH_strided_loop, - (void *)&quad_generic_comp_strided_loop_aligned}, - {NPY_METH_unaligned_strided_loop, - (void *)&quad_generic_comp_strided_loop}, - {0, NULL}}; - - PyArrayMethod_Spec Spec = { - .name = "quad_comp", - .nin = 2, - .nout = 1, - .casting = NPY_SAFE_CASTING, - .flags = NPY_METH_SUPPORTS_UNALIGNED, - .dtypes = dtypes, - .slots = slots, - }; - - if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - return -1; - } - - PyObject *promoter_capsule = - PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL); - if (promoter_capsule == NULL) { - return -1; - } - - PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType); - if (DTypes == 0) { - Py_DECREF(promoter_capsule); - return -1; - } - - if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { - Py_DECREF(promoter_capsule); - Py_DECREF(DTypes); - return -1; - } - Py_DECREF(promoter_capsule); - Py_DECREF(DTypes); - - return 0; -} - -int -init_quad_comps(PyObject *numpy) -{ - if (create_quad_comparison_ufunc(numpy, "equal") < 0) { - return -1; - } - if (create_quad_comparison_ufunc(numpy, "not_equal") < 0) { - return -1; - } - if (create_quad_comparison_ufunc(numpy, "less") < 0) { - return -1; - } - if (create_quad_comparison_ufunc(numpy, "less_equal") < 0) { - return -1; - } - if (create_quad_comparison_ufunc(numpy, "greater") < 0) { - return -1; - } - if (create_quad_comparison_ufunc(numpy, "greater_equal") < - 0) { - return -1; - } - - return 0; -} - -int -init_quad_umath(void) -{ - PyObject *numpy = PyImport_ImportModule("numpy"); - if (!numpy) { - PyErr_SetString(PyExc_ImportError, "Failed to import numpy module"); - return -1; - } - - if (init_quad_unary_ops(numpy) < 0) { - PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations"); - goto err; - } - - if (init_quad_binary_ops(numpy) < 0) { - PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations"); - goto err; - } - - if (init_quad_comps(numpy) < 0) { - PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations"); - goto err; - } - - Py_DECREF(numpy); - return 0; - -err: - Py_DECREF(numpy); - return -1; -} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp new file mode 100644 index 00000000..aa6d19ce --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp @@ -0,0 +1,235 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + + +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "numpy/dtype_api.h" +#include "numpy/ndarraytypes.h" + +#include "../quad_common.h" +#include "../scalar.h" +#include "../dtype.h" +#include "../ops.hpp" +#include "promoters.hpp" +#include "binary_ops.h" + +static NPY_CASTING +quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], + PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) +{ + QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; + QuadBackendType target_backend; + + // Determine target backend and if casting is needed + NPY_CASTING casting = NPY_NO_CASTING; + if (descr_in1->backend != descr_in2->backend) { + target_backend = BACKEND_LONGDOUBLE; + casting = NPY_SAFE_CASTING; + } + else { + target_backend = descr_in1->backend; + } + + // Set up input descriptors, casting if necessary + for (int i = 0; i < 2; i++) { + if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { + loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[i]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; + } + } + + // Set up output descriptor + if (given_descrs[2] == NULL) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + } + else { + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2]; + if (descr_out->backend != target_backend) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[2]); + loop_descrs[2] = given_descrs[2]; + } + } + return casting; +} + +template +int +quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + quad_value in1, in2, out; + while (N--) { + memcpy(&in1, in1_ptr, elem_size); + memcpy(&in2, in2_ptr, elem_size); + if (backend == BACKEND_SLEEF) { + out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value); + } + else { + out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value); + } + memcpy(out_ptr, &out, elem_size); + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + + while (N--) { + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr); + } + else { + *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr); + } + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + + +template +int +create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_binop_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_binop_strided_loop_unaligned}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_binop", + .nin = 2, + .nout = 1, + .casting = NPY_NO_CASTING, + .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE), + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + + PyObject *promoter_capsule = + PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL); + if (promoter_capsule == NULL) { + return -1; + } + + PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type); + if (DTypes == 0) { + Py_DECREF(promoter_capsule); + return -1; + } + + if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + return -1; + } + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + return 0; +} + + +int +init_quad_binary_ops(PyObject *numpy) +{ + if (create_quad_binary_ufunc(numpy, "add") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "subtract") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "multiply") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "divide") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "power") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "mod") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "minimum") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "maximum") < 0) { + return -1; + } + if (create_quad_binary_ufunc(numpy, "arctan2") < 0) { + return -1; + } + return 0; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.h b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h new file mode 100644 index 00000000..c45ec453 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h @@ -0,0 +1,9 @@ +#ifndef _QUADDTYPE_BINARY_OPS_H +#define _QUADDTYPE_BINARY_OPS_H + +#include + +int +init_quad_binary_ops(PyObject *numpy); + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp new file mode 100644 index 00000000..095b6d39 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp @@ -0,0 +1,240 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + + +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "numpy/dtype_api.h" +#include "numpy/ndarraytypes.h" + +#include "../quad_common.h" +#include "../scalar.h" +#include "../dtype.h" +#include "umath.h" +#include "../ops.hpp" +#include "promoters.hpp" +#include "binary_ops.h" +#include "comparison_ops.h" + + +static NPY_CASTING +quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], + PyArray_Descr *loop_descrs[], + npy_intp *NPY_UNUSED(view_offset)) +{ + QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; + QuadBackendType target_backend; + + // As dealing with different backends then cast to boolean + NPY_CASTING casting = NPY_NO_CASTING; + if (descr_in1->backend != descr_in2->backend) { + target_backend = BACKEND_LONGDOUBLE; + casting = NPY_SAFE_CASTING; + } + else { + target_backend = descr_in1->backend; + } + + // Set up input descriptors, casting if necessary + for (int i = 0; i < 2; i++) { + if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { + loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[i]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; + } + } + + // Set up output descriptor + loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + return casting; +} + +template +int +quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + quad_value in1, in2; + while (N--) { + memcpy(&in1, in1_ptr, elem_size); + memcpy(&in2, in2_ptr, elem_size); + npy_bool result; + + if (backend == BACKEND_SLEEF) { + result = sleef_comp(&in1.sleef_value, &in2.sleef_value); + } + else { + result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); + } + + memcpy(out_ptr, &result, sizeof(npy_bool)); + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + while (N--) { + quad_value in1 = *(quad_value *)in1_ptr; + quad_value in2 = *(quad_value *)in2_ptr; + + npy_bool result; + + if (backend == BACKEND_SLEEF) { + result = sleef_comp(&in1.sleef_value, &in2.sleef_value); + } + else { + result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); + } + + *(npy_bool *)out_ptr = result; + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +NPY_NO_EXPORT int +comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], + PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) +{ + PyArray_DTypeMeta *new_signature[NPY_MAXARGS]; + memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *)); + new_signature[2] = NULL; + int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes); + if (res < 0) { + return -1; + } + Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType); + return 0; +} + +template +int +create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_comp_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_comp_strided_loop}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_comp", + .nin = 2, + .nout = 1, + .casting = NPY_SAFE_CASTING, + .flags = NPY_METH_SUPPORTS_UNALIGNED, + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + + PyObject *promoter_capsule = + PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL); + if (promoter_capsule == NULL) { + return -1; + } + + PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType); + if (DTypes == 0) { + Py_DECREF(promoter_capsule); + return -1; + } + + if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + return -1; + } + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + + return 0; +} + +int +init_quad_comps(PyObject *numpy) +{ + if (create_quad_comparison_ufunc(numpy, "equal") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "not_equal") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "less") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "less_equal") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "greater") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "greater_equal") < + 0) { + return -1; + } + + return 0; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h new file mode 100644 index 00000000..e3b8cc0e --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h @@ -0,0 +1,9 @@ +#ifndef _QUADDTYPE_COMPARISON_OPS_H +#define _QUADDTYPE_COMPARISON_OPS_H + +#include + +int +init_quad_comps(PyObject *numpy); + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/promoters.hpp b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp new file mode 100644 index 00000000..3b3c1ef3 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp @@ -0,0 +1,90 @@ +#ifndef _QUADDTYPE_PROMOTERS +#define _QUADDTYPE_PROMOTERS + +#include +#include +#include +#include "numpy/arrayobject.h" +#include "numpy/ndarrayobject.h" +#include "numpy/ufuncobject.h" +#include "numpy/dtype_api.h" + +#include "../dtype.h" + +inline int +quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], + PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) +{ + int nin = ufunc->nin; + int nargs = ufunc->nargs; + PyArray_DTypeMeta *common = NULL; + bool has_quad = false; + + // Handle the special case for reductions + if (op_dtypes[0] == NULL) { + assert(nin == 2 && ufunc->nout == 1); /* must be reduction */ + for (int i = 0; i < 3; i++) { + Py_INCREF(op_dtypes[1]); + new_op_dtypes[i] = op_dtypes[1]; + } + return 0; + } + + // Check if any input or signature is QuadPrecision + for (int i = 0; i < nin; i++) { + if (op_dtypes[i] == &QuadPrecDType) { + has_quad = true; + } + } + + if (has_quad) { + common = &QuadPrecDType; + } + else { + for (int i = nin; i < nargs; i++) { + if (signature[i] != NULL) { + if (common == NULL) { + Py_INCREF(signature[i]); + common = signature[i]; + } + else if (common != signature[i]) { + Py_CLEAR(common); // Not homogeneous, unset common + break; + } + } + } + } + // If no common output dtype, use standard promotion for inputs + if (common == NULL) { + common = PyArray_PromoteDTypeSequence(nin, op_dtypes); + if (common == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); // Do not propagate normal promotion errors + } + + return -1; + } + } + + // Set all new_op_dtypes to the common dtype + for (int i = 0; i < nargs; i++) { + if (signature[i]) { + // If signature is specified for this argument, use it + Py_INCREF(signature[i]); + new_op_dtypes[i] = signature[i]; + } + else { + // Otherwise, use the common dtype + Py_INCREF(common); + + new_op_dtypes[i] = common; + } + } + + Py_XDECREF(common); + + return 0; +} + + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp new file mode 100644 index 00000000..2ea864e7 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp @@ -0,0 +1,115 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + +extern "C" { +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/ufuncobject.h" +#include "numpy/dtype_api.h" +} +#include "../quad_common.h" +#include "../scalar.h" +#include "../dtype.h" +#include "umath.h" +#include "../ops.hpp" +#include "unary_ops.h" +#include "binary_ops.h" +#include "comparison_ops.h" + +// helper debugging function +static const char * +get_dtype_name(PyArray_DTypeMeta *dtype) +{ + if (dtype == &QuadPrecDType) { + return "QuadPrecDType"; + } + else if (dtype == &PyArray_BoolDType) { + return "BoolDType"; + } + else if (dtype == &PyArray_ByteDType) { + return "ByteDType"; + } + else if (dtype == &PyArray_UByteDType) { + return "UByteDType"; + } + else if (dtype == &PyArray_ShortDType) { + return "ShortDType"; + } + else if (dtype == &PyArray_UShortDType) { + return "UShortDType"; + } + else if (dtype == &PyArray_IntDType) { + return "IntDType"; + } + else if (dtype == &PyArray_UIntDType) { + return "UIntDType"; + } + else if (dtype == &PyArray_LongDType) { + return "LongDType"; + } + else if (dtype == &PyArray_ULongDType) { + return "ULongDType"; + } + else if (dtype == &PyArray_LongLongDType) { + return "LongLongDType"; + } + else if (dtype == &PyArray_ULongLongDType) { + return "ULongLongDType"; + } + else if (dtype == &PyArray_FloatDType) { + return "FloatDType"; + } + else if (dtype == &PyArray_DoubleDType) { + return "DoubleDType"; + } + else if (dtype == &PyArray_LongDoubleDType) { + return "LongDoubleDType"; + } + else { + return "UnknownDType"; + } +} + +int +init_quad_umath(void) +{ + PyObject *numpy = PyImport_ImportModule("numpy"); + if (!numpy) { + PyErr_SetString(PyExc_ImportError, "Failed to import numpy module"); + return -1; + } + + if (init_quad_unary_ops(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations"); + goto err; + } + + if (init_quad_binary_ops(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations"); + goto err; + } + + if (init_quad_comps(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations"); + goto err; + } + + // if (init_quad_matmul(numpy) < 0) { + // PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations"); + // goto err; + // } + + Py_DECREF(numpy); + return 0; + +err: + Py_DECREF(numpy); + return -1; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath.h b/quaddtype/numpy_quaddtype/src/umath/umath.h similarity index 95% rename from quaddtype/numpy_quaddtype/src/umath.h rename to quaddtype/numpy_quaddtype/src/umath/umath.h index d64f26be..c9253ef7 100644 --- a/quaddtype/numpy_quaddtype/src/umath.h +++ b/quaddtype/numpy_quaddtype/src/umath/umath.h @@ -12,4 +12,4 @@ init_quad_umath(void); } #endif -#endif +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp new file mode 100644 index 00000000..4c8f31f0 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp @@ -0,0 +1,214 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + +extern "C" { +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/ufuncobject.h" + +#include "numpy/dtype_api.h" +} +#include "../quad_common.h" +#include "../scalar.h" +#include "../dtype.h" +#include "../ops.hpp" + +static NPY_CASTING +quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], + npy_intp *NPY_UNUSED(view_offset)) +{ + Py_INCREF(given_descrs[0]); + loop_descrs[0] = given_descrs[0]; + + if (given_descrs[1] == NULL) { + Py_INCREF(given_descrs[0]); + loop_descrs[1] = given_descrs[0]; + } + else { + Py_INCREF(given_descrs[1]); + loop_descrs[1] = given_descrs[1]; + } + + QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1]; + + if (descr_in->backend != descr_out->backend) { + return NPY_UNSAFE_CASTING; + } + + return NPY_NO_CASTING; +} + +template +int +quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in_ptr = data[0]; + char *out_ptr = data[1]; + npy_intp in_stride = strides[0]; + npy_intp out_stride = strides[1]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + quad_value in, out; + while (N--) { + memcpy(&in, in_ptr, elem_size); + if (backend == BACKEND_SLEEF) { + out.sleef_value = sleef_op(&in.sleef_value); + } + else { + out.longdouble_value = longdouble_op(&in.longdouble_value); + } + memcpy(out_ptr, &out, elem_size); + + in_ptr += in_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in_ptr = data[0]; + char *out_ptr = data[1]; + npy_intp in_stride = strides[0]; + npy_intp out_stride = strides[1]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + + while (N--) { + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr); + } + else { + *(long double *)out_ptr = longdouble_op((long double *)in_ptr); + } + in_ptr += in_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_unary_op_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_unary_op_strided_loop_unaligned}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_unary_op", + .nin = 1, + .nout = 1, + .casting = NPY_NO_CASTING, + .flags = NPY_METH_SUPPORTS_UNALIGNED, + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + + return 0; +} + +int +init_quad_unary_ops(PyObject *numpy) +{ + if (create_quad_unary_ufunc(numpy, "negative") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "positive") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "absolute") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "rint") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "trunc") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "floor") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "ceil") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "sqrt") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "square") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log2") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log10") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log1p") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "exp") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "exp2") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "sin") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "cos") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "tan") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arcsin") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arccos") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arctan") < 0) { + return -1; + } + return 0; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.h b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h new file mode 100644 index 00000000..6c3f17a4 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h @@ -0,0 +1,9 @@ +#ifndef _QUADDTYPE_UNARY_OPS_H +#define _QUADDTYPE_UNARY_OPS_H + +#include + +int +init_quad_unary_ops(PyObject *numpy); + +#endif From a35abce6cdcfbcdf9b660c41b21c5ce9be9c8c15 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:13:56 +0000 Subject: [PATCH 02/20] updaing ci --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/ci.yml | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 37a80387..e16e2ff4 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,7 +3,7 @@ name: Build Wheels on: push: branches: - - main + - matmul-ufunc tags: - "quaddtype-v*" paths: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9267d9f4..0b4f2199 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: Numpy User DTypes CI on: push: branches: - - main + - matmul-ufunc pull_request: workflow_dispatch: @@ -61,15 +61,21 @@ jobs: sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev - name: Install SLEEF run: | + yum update -y + yum install -y cmake gcc gcc-c++ make git pkgconfig git clone --branch 3.8 https://github.com/shibatch/sleef.git cd sleef cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON cmake --build build/ --clean-first -j - sudo cmake --install build --prefix /usr + sudo cmake --install build --prefix /usr/local - name: Install quaddtype working-directory: quaddtype run: | - LDFLAGS="-Wl,-rpath,/usr/lib" python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug" + CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS" + CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS" + LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS" + LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" + python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug" - name: Run quaddtype tests working-directory: quaddtype run: | From 851654472afaff41c5b4a56701c1ea8de644b522 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:16:54 +0000 Subject: [PATCH 03/20] switching to apt --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0b4f2199..7a1b7e9c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,8 +61,8 @@ jobs: sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev - name: Install SLEEF run: | - yum update -y - yum install -y cmake gcc gcc-c++ make git pkgconfig + sudo apt-get update -y + sudo apt-get install -y cmake gcc g++ make git pkg-config git clone --branch 3.8 https://github.com/shibatch/sleef.git cd sleef cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON From 335f42534d2ec92130cb03a5d5dfa1916eab5045 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:22:01 +0000 Subject: [PATCH 04/20] submodule fix --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7a1b7e9c..d2aa43fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,9 @@ jobs: CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS" LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS" LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" + + git submodule update --init --recursive + ls -la quaddtype/numpy_quaddtype/QBLAS/ python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug" - name: Run quaddtype tests working-directory: quaddtype From 85e7840aab3737622203d0b62100df69d05b175e Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:26:37 +0000 Subject: [PATCH 05/20] submodule fix --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d2aa43fb..ff64875f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,7 +77,7 @@ jobs: LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" git submodule update --init --recursive - ls -la quaddtype/numpy_quaddtype/QBLAS/ + ls -la numpy_quaddtype/QBLAS/ python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug" - name: Run quaddtype tests working-directory: quaddtype From e467f4b01c09c45d5bd0f3cefc80cc64c45b4772 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Wed, 16 Jul 2025 12:32:42 +0000 Subject: [PATCH 06/20] submodule fix --- .github/workflows/ci.yml | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff64875f..a70e8c10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,7 @@ jobs: run: | sudo apt-get update sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev + - name: Install SLEEF run: | sudo apt-get update -y @@ -68,17 +69,27 @@ jobs: cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON cmake --build build/ --clean-first -j sudo cmake --install build --prefix /usr/local + - name: Install quaddtype working-directory: quaddtype run: | - CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS" - CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS" - LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS" - LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" - - git submodule update --init --recursive - ls -la numpy_quaddtype/QBLAS/ - python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug" + # Initialize submodules first + git submodule update --init --recursive + ls -la numpy_quaddtype/QBLAS/ + + # Set environment variables with proper export and correct paths + export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include" + export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals" + export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp" + export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" + + # Install with meson args to ensure the C++ flags are passed through + python -m pip install . -v --no-build-isolation \ + -Cbuilddir=build \ + -C'compile-args=-v' \ + -Csetup-args="-Dbuildtype=debug" \ + -Csetup-args="-Dcpp_args=-fext-numeric-literals" + - name: Run quaddtype tests working-directory: quaddtype run: | From e201b90f7d5f299ab41cf04d92d9045d5da9ae21 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Thu, 17 Jul 2025 07:16:29 +0000 Subject: [PATCH 07/20] initial matmul ufunc setup --- quaddtype/meson.build | 2 + .../src/quadblas_interface.cpp | 91 +++++++++++ .../numpy_quaddtype/src/umath/matmul.cpp | 148 ++++++++++++++++++ quaddtype/numpy_quaddtype/src/umath/matmul.h | 8 + quaddtype/numpy_quaddtype/src/umath/umath.cpp | 9 +- 5 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.cpp create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.h diff --git a/quaddtype/meson.build b/quaddtype/meson.build index 66318d60..c000675a 100644 --- a/quaddtype/meson.build +++ b/quaddtype/meson.build @@ -63,6 +63,8 @@ srcs = [ 'numpy_quaddtype/src/umath/comparison_ops.h', 'numpy_quaddtype/src/umath/comparison_ops.cpp', 'numpy_quaddtype/src/umath/promoters.hpp', + 'numpy_quaddtype/src/umath/matmul.h', + 'numpy_quaddtype/src/umath/matmul.cpp', ] py.install_sources( diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 6bc3fb0d..cce39c39 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -786,4 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args) return PyUnicode_FromString(QuadBLAS::VERSION); } +void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out) +{ + Sleef_quad *data_a, *data_b; + QuadBackendType backend_a, backend_b; + QuadBLAS::Layout layout_a, layout_b; + + if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || + !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { + return nullptr; + } + + Sleef_quad *temp_a = nullptr, *temp_b = nullptr; + Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); + Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); + + if (!sleef_a || !sleef_b) { + QuadBLAS::aligned_free(temp_a); + QuadBLAS::aligned_free(temp_b); + return nullptr; + } + + QuadBackendType result_backend = BACKEND_SLEEF; + if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { + result_backend = BACKEND_LONGDOUBLE; + } + + npy_intp result_dims[2] = {m, n}; + QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); + if (!result_dtype) { + QuadBLAS::aligned_free(temp_a); + QuadBLAS::aligned_free(temp_b); + return nullptr; + } + + PyArrayObject *result = + (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); + if (!result) { + QuadBLAS::aligned_free(temp_a); + QuadBLAS::aligned_free(temp_b); + Py_DECREF(result_dtype); + return nullptr; + } + + Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); + for (npy_intp i = 0; i < m * n; i++) { + result_data[i] = Sleef_cast_from_doubleq1(0.0); + } + + npy_intp lda, ldb, ldc; + + if (layout_a == QuadBLAS::Layout::RowMajor) { + lda = k; + } + else { + lda = m; + } + + if (layout_b == QuadBLAS::Layout::RowMajor) { + ldb = n; + } + else { + ldb = k; + } + + QuadBLAS::Layout result_layout = layout_a; + if (result_layout == QuadBLAS::Layout::RowMajor) { + ldc = n; + } + else { + ldc = m; + } + + Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); + Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); + + QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data, + ldc); + + if (result_backend == BACKEND_LONGDOUBLE) { + long double *ld_result = (long double *)PyArray_DATA(result); + for (npy_intp i = 0; i < m * n; i++) { + ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); + } + } + + QuadBLAS::aligned_free(temp_a); + QuadBLAS::aligned_free(temp_b); + + return (PyObject *)result; +} + #endif // DISABLE_QUADBLAS \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp new file mode 100644 index 00000000..47fdf5f5 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -0,0 +1,148 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + + +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "numpy/dtype_api.h" +#include "numpy/ndarraytypes.h" + +#include "../quad_common.h" +#include "../scalar.h" +#include "../dtype.h" +#include "../ops.hpp" +#include "binary_ops.h" +#include "matmul.h" + +#include + +static NPY_CASTING +quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], + PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) +{ + + NPY_CASTING casting = NPY_NO_CASTING; + std::cout << "exiting the descriptor"; + return casting; +} + +template +int +quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + quad_value in1, in2, out; + while (N--) { + memcpy(&in1, in1_ptr, elem_size); + memcpy(&in2, in2_ptr, elem_size); + if (backend == BACKEND_SLEEF) { + out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value); + } + else { + out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value); + } + memcpy(out_ptr, &out, elem_size); + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + + while (N--) { + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr); + } + else { + *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr); + } + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +int +create_matmul_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_matmul_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_matmul_strided_loop_unaligned}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_matmul", + .nin = 2, + .nout = 1, + .casting = NPY_NO_CASTING, + .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE), + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + // my guess we don't need any promoter here as of now, since matmul is quad specific + return 0; +} + + +int +init_matmul_ops(PyObject *numpy) +{ + if (create_matmul_ufunc(numpy, "matmul") < 0) { + return -1; + } + return 0; +} + diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h new file mode 100644 index 00000000..bc099eba --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h @@ -0,0 +1,8 @@ +#ifndef _QUADDTYPE_MATMUL_OPS_H +#define _QUADDTYPE_MATMUL_OPS_H + +#include + +int +init_matmul_ops(PyObject *numpy); +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp index 2ea864e7..50f95623 100644 --- a/quaddtype/numpy_quaddtype/src/umath/umath.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp @@ -22,6 +22,7 @@ extern "C" { #include "unary_ops.h" #include "binary_ops.h" #include "comparison_ops.h" +#include "matmul.h" // helper debugging function static const char * @@ -101,10 +102,10 @@ init_quad_umath(void) goto err; } - // if (init_quad_matmul(numpy) < 0) { - // PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations"); - // goto err; - // } + if (init_matmul_ops(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations"); + goto err; + } Py_DECREF(numpy); return 0; From 09918a310859e7879b3d2e2120e87a86e0ca0605 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Thu, 17 Jul 2025 15:50:04 +0530 Subject: [PATCH 08/20] mid-way test --- quaddtype/numpy_quaddtype/QBLAS | 2 +- .../src/quadblas_interface.cpp | 180 +++++++++--------- .../numpy_quaddtype/src/umath/matmul.cpp | 25 ++- quaddtype/release_tracker.md | 93 +++++++++ 4 files changed, 195 insertions(+), 105 deletions(-) create mode 100644 quaddtype/release_tracker.md diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS index 0eabb677..4853ac1c 160000 --- a/quaddtype/numpy_quaddtype/QBLAS +++ b/quaddtype/numpy_quaddtype/QBLAS @@ -1 +1 @@ -Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16 +Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index cce39c39..46f953de 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -786,95 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args) return PyUnicode_FromString(QuadBLAS::VERSION); } -void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out) -{ - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - QuadBLAS::Layout layout_a, layout_b; - - if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || - !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - npy_intp result_dims[2] = {m, n}; - QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); - if (!result_dtype) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - PyArrayObject *result = - (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); - if (!result) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - Py_DECREF(result_dtype); - return nullptr; - } - - Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); - for (npy_intp i = 0; i < m * n; i++) { - result_data[i] = Sleef_cast_from_doubleq1(0.0); - } - - npy_intp lda, ldb, ldc; - - if (layout_a == QuadBLAS::Layout::RowMajor) { - lda = k; - } - else { - lda = m; - } - - if (layout_b == QuadBLAS::Layout::RowMajor) { - ldb = n; - } - else { - ldb = k; - } - - QuadBLAS::Layout result_layout = layout_a; - if (result_layout == QuadBLAS::Layout::RowMajor) { - ldc = n; - } - else { - ldc = m; - } - - Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); - Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - - QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data, - ldc); - - if (result_backend == BACKEND_LONGDOUBLE) { - long double *ld_result = (long double *)PyArray_DATA(result); - for (npy_intp i = 0; i < m * n; i++) { - ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); - } - } - - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - - return (PyObject *)result; -} +// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out) +// { +// Sleef_quad *data_a, *data_b; +// QuadBackendType backend_a, backend_b; +// QuadBLAS::Layout layout_a, layout_b; + +// if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || +// !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { +// return nullptr; +// } + +// Sleef_quad *temp_a = nullptr, *temp_b = nullptr; +// Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); +// Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); + +// if (!sleef_a || !sleef_b) { +// QuadBLAS::aligned_free(temp_a); +// QuadBLAS::aligned_free(temp_b); +// return nullptr; +// } + +// QuadBackendType result_backend = BACKEND_SLEEF; +// if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { +// result_backend = BACKEND_LONGDOUBLE; +// } + +// npy_intp result_dims[2] = {m, n}; +// QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); +// if (!result_dtype) { +// QuadBLAS::aligned_free(temp_a); +// QuadBLAS::aligned_free(temp_b); +// return nullptr; +// } + +// PyArrayObject *result = +// (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); +// if (!result) { +// QuadBLAS::aligned_free(temp_a); +// QuadBLAS::aligned_free(temp_b); +// Py_DECREF(result_dtype); +// return nullptr; +// } + +// Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); +// for (npy_intp i = 0; i < m * n; i++) { +// result_data[i] = Sleef_cast_from_doubleq1(0.0); +// } + +// npy_intp lda, ldb, ldc; + +// if (layout_a == QuadBLAS::Layout::RowMajor) { +// lda = k; +// } +// else { +// lda = m; +// } + +// if (layout_b == QuadBLAS::Layout::RowMajor) { +// ldb = n; +// } +// else { +// ldb = k; +// } + +// QuadBLAS::Layout result_layout = layout_a; +// if (result_layout == QuadBLAS::Layout::RowMajor) { +// ldc = n; +// } +// else { +// ldc = m; +// } + +// Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); +// Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); + +// QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data, +// ldc); + +// if (result_backend == BACKEND_LONGDOUBLE) { +// long double *ld_result = (long double *)PyArray_DATA(result); +// for (npy_intp i = 0; i < m * n; i++) { +// ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); +// } +// } + +// QuadBLAS::aligned_free(temp_a); +// QuadBLAS::aligned_free(temp_b); + +// return (PyObject *)result; +// } #endif // DISABLE_QUADBLAS \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index 47fdf5f5..01b8935d 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -5,7 +5,6 @@ #define NO_IMPORT_ARRAY #define NO_IMPORT_UFUNC - #include #include @@ -25,20 +24,19 @@ static NPY_CASTING quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], - PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) + PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], + npy_intp *NPY_UNUSED(view_offset)) { - - NPY_CASTING casting = NPY_NO_CASTING; - std::cout << "exiting the descriptor"; - return casting; + NPY_CASTING casting = NPY_NO_CASTING; + std::cout << "exiting the descriptor"; + return casting; } template int quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) { npy_intp N = dimensions[0]; char *in1_ptr = data[0], *in2_ptr = data[1]; @@ -73,8 +71,8 @@ quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char template int quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) { npy_intp N = dimensions[0]; char *in1_ptr = data[0], *in2_ptr = data[1]; @@ -101,6 +99,7 @@ quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *c return 0; } +template int create_matmul_ufunc(PyObject *numpy, const char *ufunc_name) { @@ -136,13 +135,11 @@ create_matmul_ufunc(PyObject *numpy, const char *ufunc_name) return 0; } - int init_matmul_ops(PyObject *numpy) { - if (create_matmul_ufunc(numpy, "matmul") < 0) { + if (create_matmul_ufunc(numpy, "matmul") < 0) { return -1; } return 0; } - diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md new file mode 100644 index 00000000..1ecf7d3a --- /dev/null +++ b/quaddtype/release_tracker.md @@ -0,0 +1,93 @@ +# Plan for `numpy-quaddtype` v1.5 + +| ufunc name | Added | +| ------------- | ----- | +| add | ✅ | +| subtract | ✅ | +| multiply | ✅ | +| matmul | #116 | +| divide | ✅ | +| logaddexp | | +| logaddexp2 | | +| true_divide | | +| floor_divide | | +| negative | ✅ | +| positive | ✅ | +| power | ✅ | +| float_power | | +| remainder | | +| mod | ✅ | +| fmod | | +| divmod | | +| absolute | ✅ | +| fabs | | +| rint | ✅ | +| sign | | +| heaviside | | +| conj | | +| conjugate | | +| exp | ✅ | +| exp2 | ✅ | +| log | ✅ | +| log2 | ✅ | +| log10 | ✅ | +| expm1 | | +| log1p | ✅ | +| sqrt | ✅ | +| square | ✅ | +| cbrt | | +| reciprocal | | +| gcd | | +| lcm | | +| sin | ✅ | +| cos | ✅ | +| tan | ✅ | +| arcsin | ✅ | +| arccos | ✅ | +| arctan | ✅ | +| arctan2 | ✅ | +| hypot | | +| sinh | | +| cosh | | +| tanh | | +| arcsinh | | +| arccosh | | +| arctanh | | +| degrees | | +| radians | | +| deg2rad | | +| rad2deg | | +| bitwise_and | | +| bitwise_or | | +| bitwise_xor | | +| invert | | +| left_shift | | +| right_shift | | +| greater | ✅ | +| greater_equal | ✅ | +| less | ✅ | +| less_equal | ✅ | +| not_equal | ✅ | +| equal | ✅ | +| logical_and | | +| logical_or | | +| logical_xor | | +| logical_not | | +| maximum | ✅ | +| minimum | ✅ | +| fmax | | +| fmin | | +| isfinite | | +| isinf | | +| isnan | | +| isnat | | +| signbit | | +| copysign | | +| nextafter | | +| spacing | | +| modf | | +| ldexp | | +| frexp | | +| floor | ✅ | +| ceil | ✅ | +| trunc | ✅ | From 70ca6446bf8146759a0ca21710f0a908ff3e1eb9 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Thu, 17 Jul 2025 19:47:32 +0530 Subject: [PATCH 09/20] shifting to matmul ufunc --- quaddtype/numpy_quaddtype/__init__.py | 3 +- .../numpy_quaddtype/src/quaddtype_main.c | 69 ++++++++++--------- quaddtype/tests/test_dot.py | 67 ++++++++---------- 3 files changed, 66 insertions(+), 73 deletions(-) diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py index b0a9f3be..8b588c11 100644 --- a/quaddtype/numpy_quaddtype/__init__.py +++ b/quaddtype/numpy_quaddtype/__init__.py @@ -3,7 +3,6 @@ QuadPrecDType, is_longdouble_128, get_sleef_constant, - qblas_dot as dot, set_num_threads, get_num_threads, get_quadblas_version @@ -17,7 +16,7 @@ # Constants 'pi', 'e', 'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon', # QuadBLAS related functions - 'dot', 'set_num_threads', 'get_num_threads', 'get_quadblas_version' + 'set_num_threads', 'get_num_threads', 'get_quadblas_version' ] def SleefQuadPrecision(value): diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c index 641200d3..1e8fd535 100644 --- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c +++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c @@ -19,45 +19,55 @@ #include "quadblas_interface.h" #include "float.h" - -static PyObject* py_is_longdouble_128(PyObject* self, PyObject* args) { - if(sizeof(long double) == 16 && - LDBL_MANT_DIG == 113 && - LDBL_MAX_EXP == 16384) { +static PyObject * +py_is_longdouble_128(PyObject *self, PyObject *args) +{ + if (sizeof(long double) == 16 && LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384) { Py_RETURN_TRUE; - } else { + } + else { Py_RETURN_FALSE; } } -static PyObject* get_sleef_constant(PyObject* self, PyObject* args) { - const char* constant_name; +static PyObject * +get_sleef_constant(PyObject *self, PyObject *args) +{ + const char *constant_name; if (!PyArg_ParseTuple(args, "s", &constant_name)) { return NULL; } - QuadPrecisionObject* result = QuadPrecision_raw_new(BACKEND_SLEEF); + QuadPrecisionObject *result = QuadPrecision_raw_new(BACKEND_SLEEF); if (result == NULL) { return NULL; } if (strcmp(constant_name, "pi") == 0) { result->value.sleef_value = SLEEF_M_PIq; - } else if (strcmp(constant_name, "e") == 0) { + } + else if (strcmp(constant_name, "e") == 0) { result->value.sleef_value = SLEEF_M_Eq; - } else if (strcmp(constant_name, "log2e") == 0) { + } + else if (strcmp(constant_name, "log2e") == 0) { result->value.sleef_value = SLEEF_M_LOG2Eq; - } else if (strcmp(constant_name, "log10e") == 0) { + } + else if (strcmp(constant_name, "log10e") == 0) { result->value.sleef_value = SLEEF_M_LOG10Eq; - } else if (strcmp(constant_name, "ln2") == 0) { + } + else if (strcmp(constant_name, "ln2") == 0) { result->value.sleef_value = SLEEF_M_LN2q; - } else if (strcmp(constant_name, "ln10") == 0) { + } + else if (strcmp(constant_name, "ln10") == 0) { result->value.sleef_value = SLEEF_M_LN10q; - } else if (strcmp(constant_name, "quad_max") == 0) { + } + else if (strcmp(constant_name, "quad_max") == 0) { result->value.sleef_value = SLEEF_QUAD_MAX; - } else if (strcmp(constant_name, "quad_min") == 0) { + } + else if (strcmp(constant_name, "quad_min") == 0) { result->value.sleef_value = SLEEF_QUAD_MIN; - } else if (strcmp(constant_name, "epsilon") == 0) { + } + else if (strcmp(constant_name, "epsilon") == 0) { result->value.sleef_value = SLEEF_QUAD_EPSILON; } else { @@ -66,26 +76,23 @@ static PyObject* get_sleef_constant(PyObject* self, PyObject* args) { return NULL; } - return (PyObject*)result; + return (PyObject *)result; } static PyMethodDef module_methods[] = { - {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"}, - {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"}, - {"qblas_dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"}, - {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, "Set number of threads for QuadBLAS"}, - {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, "Get number of threads for QuadBLAS"}, - {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"}, - {NULL, NULL, 0, NULL} -}; + {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"}, + {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"}, + {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, + "Set number of threads for QuadBLAS"}, + {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, + "Get number of threads for QuadBLAS"}, + {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"}, + {NULL, NULL, 0, NULL}}; static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - .m_name = "_quaddtype_main", + PyModuleDef_HEAD_INIT, .m_name = "_quaddtype_main", .m_doc = "Quad (128-bit) floating point Data Type for NumPy with multiple backends", - .m_size = -1, - .m_methods = module_methods -}; + .m_size = -1, .m_methods = module_methods}; PyMODINIT_FUNC PyInit__quaddtype_main(void) diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py index ed135f47..f3fa3f62 100644 --- a/quaddtype/tests/test_dot.py +++ b/quaddtype/tests/test_dot.py @@ -1,19 +1,6 @@ -""" -Focused test suite for the dot function in numpy_quaddtype - -This module tests the QuadBLAS dot function for: -- Vector-vector dot products -- Matrix-vector multiplication -- Matrix-matrix multiplication -- Small and large matrix operations -- Basic correctness validation - -Uses only the Sleef backend for simplicity. -""" - import pytest import numpy as np -from numpy_quaddtype import QuadPrecision, QuadPrecDType, dot +from numpy_quaddtype import QuadPrecision, QuadPrecDType # ================================================================================ @@ -81,14 +68,14 @@ def create_quad_array(values, shape=None): # ================================================================================ class TestVectorVectorDot: - """Test vector-vector dot products""" + """Test vector-vector np.matmul products""" def test_simple_dot_product(self): - """Test basic vector dot product""" + """Test basic vector np.matmul product""" x = create_quad_array([1, 2, 3]) y = create_quad_array([4, 5, 6]) - result = dot(x, y) + result = np.matmul(x, y) expected = 1*4 + 2*5 + 3*6 # = 32 assert isinstance(result, QuadPrecision) @@ -99,14 +86,14 @@ def test_orthogonal_vectors(self): x = create_quad_array([1, 0, 0]) y = create_quad_array([0, 1, 0]) - result = dot(x, y) + result = np.matmul(x, y) assert_quad_equal(result, 0.0) def test_same_vector(self): - """Test dot product of vector with itself""" + """Test np.matmul product of vector with itself""" x = create_quad_array([2, 3, 4]) - result = dot(x, x) + result = np.matmul(x, x) expected = 2*2 + 3*3 + 4*4 # = 29 assert_quad_equal(result, expected) @@ -121,7 +108,7 @@ def test_various_vector_sizes(self, size): x = create_quad_array(x_vals) y = create_quad_array(y_vals) - result = dot(x, y) + result = np.matmul(x, y) expected = sum(x_vals[i] * y_vals[i] for i in range(size)) assert_quad_equal(result, expected) @@ -131,7 +118,7 @@ def test_negative_and_fractional_values(self): x = create_quad_array([1.5, -2.5, 3.25]) y = create_quad_array([-1.25, 2.75, -3.5]) - result = dot(x, y) + result = np.matmul(x, y) expected = 1.5*(-1.25) + (-2.5)*2.75 + 3.25*(-3.5) assert_quad_equal(result, expected) @@ -151,7 +138,7 @@ def test_simple_matrix_vector(self): # 3x1 vector x = create_quad_array([1, 1, 1]) - result = dot(A, x) + result = np.matmul(A, x) expected = [1+2+3, 4+5+6] # [6, 15] assert result.shape == (2,) @@ -164,7 +151,7 @@ def test_identity_matrix_vector(self): I = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3)) x = create_quad_array([2, 3, 4]) - result = dot(I, x) + result = np.matmul(I, x) assert result.shape == (3,) for i in range(3): @@ -181,7 +168,7 @@ def test_various_matrix_vector_sizes(self, m, n): x_vals = [i + 1 for i in range(n)] x = create_quad_array(x_vals) - result = dot(A, x) + result = np.matmul(A, x) assert result.shape == (m,) @@ -205,7 +192,7 @@ def test_simple_matrix_matrix(self): A = create_quad_array([1, 2, 3, 4], shape=(2, 2)) B = create_quad_array([5, 6, 7, 8], shape=(2, 2)) - result = dot(A, B) + result = np.matmul(A, B) # Expected: [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]] = [[19, 22], [43, 50]] expected = [[19, 22], [43, 50]] @@ -221,11 +208,11 @@ def test_identity_matrix_multiplication(self): I = create_quad_array([1, 0, 0, 1], shape=(2, 2)) # A * I should equal A - result1 = dot(A, I) + result1 = np.matmul(A, I) assert_quad_array_equal(result1, A) # I * A should equal A - result2 = dot(I, A) + result2 = np.matmul(I, A) assert_quad_array_equal(result2, A) @pytest.mark.parametrize("m,n,k", [(2,2,2), (2,3,4), (3,2,5), (4,4,4), (5,6,7)]) @@ -239,7 +226,7 @@ def test_various_matrix_sizes(self, m, n, k): B_vals = [(i*n + j + 1) for i in range(k) for j in range(n)] B = create_quad_array(B_vals, shape=(k, n)) - result = dot(A, B) + result = np.matmul(A, B) assert result.shape == (m, n) @@ -258,12 +245,12 @@ def test_associativity(self): C = create_quad_array([1, 1, 2, 1], shape=(2, 2)) # Compute (A*B)*C - AB = dot(A, B) - result1 = dot(AB, C) + AB = np.matmul(A, B) + result1 = np.matmul(AB, C) # Compute A*(B*C) - BC = dot(B, C) - result2 = dot(A, BC) + BC = np.matmul(B, C) + result2 = np.matmul(A, BC) assert_quad_array_equal(result1, result2, rtol=1e-25) @@ -285,7 +272,7 @@ def test_large_square_matrices(self, size): A = create_quad_array(A_vals, shape=(size, size)) B = create_quad_array(B_vals, shape=(size, size)) - result = dot(A, B) + result = np.matmul(A, B) assert result.shape == (size, size) @@ -303,7 +290,7 @@ def test_large_square_matrices(self, size): assert_quad_equal(result[size//2, size//2], expected_value, rtol=1e-15, atol=1e-15) def test_large_vector_operations(self): - """Test large vector dot products""" + """Test large vector np.matmul products""" size = 1000 # Create vectors with known sum @@ -313,7 +300,7 @@ def test_large_vector_operations(self): x = create_quad_array(x_vals) y = create_quad_array(y_vals) - result = dot(x, y) + result = np.matmul(x, y) expected = size * 1.0 * 2.0 # = 2000.0 assert_quad_equal(result, expected) @@ -329,7 +316,7 @@ def test_rectangular_large_matrices(self): A = create_quad_array(A_vals, shape=(m, k)) B = create_quad_array(B_vals, shape=(k, n)) - result = dot(A, B) + result = np.matmul(A, B) assert result.shape == (m, n) @@ -354,7 +341,7 @@ def test_dimension_mismatch_vectors(self): y = create_quad_array([1, 2, 3]) with pytest.raises(ValueError, match="same length"): - dot(x, y) + np.matmul(x, y) def test_dimension_mismatch_matrix_vector(self): """Test dimension mismatch in matrix-vector""" @@ -362,7 +349,7 @@ def test_dimension_mismatch_matrix_vector(self): x = create_quad_array([1, 2, 3]) # Wrong size with pytest.raises(ValueError, match="columns must match"): - dot(A, x) + np.matmul(A, x) def test_dimension_mismatch_matrices(self): """Test dimension mismatch in matrix-matrix""" @@ -370,7 +357,7 @@ def test_dimension_mismatch_matrices(self): B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2)) # Wrong size with pytest.raises(ValueError, match="Matrix inner dimensions must match"): - dot(A, B) + np.matmul(A, B) if __name__ == "__main__": From f89c2e6a1bc66433a3c3dd8c3cd0e9a101b400e2 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Fri, 18 Jul 2025 01:22:58 +0530 Subject: [PATCH 10/20] will figure out later --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/ci.yml | 6 +- .../src/quadblas_interface.cpp | 91 ------------------- 3 files changed, 4 insertions(+), 95 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index e16e2ff4..0bf55c46 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,7 +3,7 @@ name: Build Wheels on: push: branches: - - matmul-ufunc + - dot tags: - "quaddtype-v*" paths: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a70e8c10..e42c12c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: Numpy User DTypes CI on: push: branches: - - matmul-ufunc + - dot pull_request: workflow_dispatch: @@ -76,13 +76,13 @@ jobs: # Initialize submodules first git submodule update --init --recursive ls -la numpy_quaddtype/QBLAS/ - + # Set environment variables with proper export and correct paths export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include" export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals" export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp" export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH" - + # Install with meson args to ensure the C++ flags are passed through python -m pip install . -v --no-build-isolation \ -Cbuilddir=build \ diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 46f953de..6bc3fb0d 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -786,95 +786,4 @@ py_quadblas_get_version(PyObject *self, PyObject *args) return PyUnicode_FromString(QuadBLAS::VERSION); } -// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out) -// { -// Sleef_quad *data_a, *data_b; -// QuadBackendType backend_a, backend_b; -// QuadBLAS::Layout layout_a, layout_b; - -// if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || -// !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { -// return nullptr; -// } - -// Sleef_quad *temp_a = nullptr, *temp_b = nullptr; -// Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); -// Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); - -// if (!sleef_a || !sleef_b) { -// QuadBLAS::aligned_free(temp_a); -// QuadBLAS::aligned_free(temp_b); -// return nullptr; -// } - -// QuadBackendType result_backend = BACKEND_SLEEF; -// if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { -// result_backend = BACKEND_LONGDOUBLE; -// } - -// npy_intp result_dims[2] = {m, n}; -// QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); -// if (!result_dtype) { -// QuadBLAS::aligned_free(temp_a); -// QuadBLAS::aligned_free(temp_b); -// return nullptr; -// } - -// PyArrayObject *result = -// (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); -// if (!result) { -// QuadBLAS::aligned_free(temp_a); -// QuadBLAS::aligned_free(temp_b); -// Py_DECREF(result_dtype); -// return nullptr; -// } - -// Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); -// for (npy_intp i = 0; i < m * n; i++) { -// result_data[i] = Sleef_cast_from_doubleq1(0.0); -// } - -// npy_intp lda, ldb, ldc; - -// if (layout_a == QuadBLAS::Layout::RowMajor) { -// lda = k; -// } -// else { -// lda = m; -// } - -// if (layout_b == QuadBLAS::Layout::RowMajor) { -// ldb = n; -// } -// else { -// ldb = k; -// } - -// QuadBLAS::Layout result_layout = layout_a; -// if (result_layout == QuadBLAS::Layout::RowMajor) { -// ldc = n; -// } -// else { -// ldc = m; -// } - -// Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); -// Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - -// QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data, -// ldc); - -// if (result_backend == BACKEND_LONGDOUBLE) { -// long double *ld_result = (long double *)PyArray_DATA(result); -// for (npy_intp i = 0; i < m * n; i++) { -// ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); -// } -// } - -// QuadBLAS::aligned_free(temp_a); -// QuadBLAS::aligned_free(temp_b); - -// return (PyObject *)result; -// } - #endif // DISABLE_QUADBLAS \ No newline at end of file From 894a84db72cf4f9ec3b0367ae669102cb61eecda Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sat, 19 Jul 2025 13:46:32 +0530 Subject: [PATCH 11/20] matmul registered with naive --- .../numpy_quaddtype/src/umath/matmul.cpp | 264 +++++++++++++----- quaddtype/numpy_quaddtype/src/umath/matmul.h | 38 ++- quaddtype/tests/test_dot.py | 6 +- 3 files changed, 225 insertions(+), 83 deletions(-) diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index 01b8935d..00cc8589 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -5,141 +5,251 @@ #define NO_IMPORT_ARRAY #define NO_IMPORT_UFUNC +extern "C" { #include #include +#include #include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" #include "numpy/ufuncobject.h" #include "numpy/dtype_api.h" -#include "numpy/ndarraytypes.h" +} #include "../quad_common.h" #include "../scalar.h" #include "../dtype.h" #include "../ops.hpp" -#include "binary_ops.h" #include "matmul.h" +#include "promoters.hpp" -#include - +/** + * Resolve descriptors for matmul operation. + * Follows the same pattern as binary_ops.cpp + */ static NPY_CASTING quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) { - NPY_CASTING casting = NPY_NO_CASTING; - std::cout << "exiting the descriptor"; - return casting; -} + // Follow the exact same pattern as quad_binary_op_resolve_descriptors + QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; + QuadBackendType target_backend; -template -int -quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + // Determine target backend and if casting is needed + NPY_CASTING casting = NPY_NO_CASTING; + if (descr_in1->backend != descr_in2->backend) { + target_backend = BACKEND_LONGDOUBLE; + casting = NPY_SAFE_CASTING; + } + else { + target_backend = descr_in1->backend; + } - quad_value in1, in2, out; - while (N--) { - memcpy(&in1, in1_ptr, elem_size); - memcpy(&in2, in2_ptr, elem_size); - if (backend == BACKEND_SLEEF) { - out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value); + // Set up input descriptors, casting if necessary + for (int i = 0; i < 2; i++) { + if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { + loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[i]) { + return (NPY_CASTING)-1; + } } else { - out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value); + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; } - memcpy(out_ptr, &out, elem_size); + } - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; + // Set up output descriptor + if (given_descrs[2] == NULL) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } } - return 0; + else { + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2]; + if (descr_out->backend != target_backend) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[2]); + loop_descrs[2] = given_descrs[2]; + } + } + return casting; } -template -int -quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) +/** + * Matrix multiplication strided loop using NumPy 2.0 API. + * Implements general matrix multiplication for arbitrary dimensions. + * + * For matmul with signature (m?,n),(n,p?)->(m?,p?): + * - dimensions[0] = N (loop dimension, number of batch operations) + * - dimensions[1] = m (rows of first matrix) + * - dimensions[2] = n (cols of first matrix / rows of second matrix) + * - dimensions[3] = p (cols of second matrix) + * + * - strides[0], strides[1], strides[2] = batch strides for A, B, C + * - strides[3], strides[4] = row stride, col stride for A (m, n) + * - strides[5], strides[6] = row stride, col stride for B (n, p) + * - strides[7], strides[8] = row stride, col stride for C (m, p) + */ +static int +quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata) { - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - + // Extract dimensions + npy_intp N = dimensions[0]; // Number of batch operations + npy_intp m = dimensions[1]; // Rows of first matrix + npy_intp n = dimensions[2]; // Cols of first matrix / rows of second matrix + npy_intp p = dimensions[3]; // Cols of second matrix + + // Extract batch strides + npy_intp A_batch_stride = strides[0]; + npy_intp B_batch_stride = strides[1]; + npy_intp C_batch_stride = strides[2]; + + // Extract core strides for matrix dimensions + npy_intp A_row_stride = strides[3]; // Stride along m dimension of A + npy_intp A_col_stride = strides[4]; // Stride along n dimension of A + npy_intp B_row_stride = strides[5]; // Stride along n dimension of B + npy_intp B_col_stride = strides[6]; // Stride along p dimension of B + npy_intp C_row_stride = strides[7]; // Stride along m dimension of C + npy_intp C_col_stride = strides[8]; // Stride along p dimension of C + + // Get backend from descriptor QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - while (N--) { - if (backend == BACKEND_SLEEF) { - *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr); - } - else { - *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr); + // Process each batch + for (npy_intp batch = 0; batch < N; batch++) { + char *A_batch = data[0] + batch * A_batch_stride; + char *B_batch = data[1] + batch * B_batch_stride; + char *C_batch = data[2] + batch * C_batch_stride; + + // Perform matrix multiplication: C = A @ B + // C[i,j] = sum_k(A[i,k] * B[k,j]) + for (npy_intp i = 0; i < m; i++) { + for (npy_intp j = 0; j < p; j++) { + char *C_ij = C_batch + i * C_row_stride + j * C_col_stride; + + if (backend == BACKEND_SLEEF) { + Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); // Initialize to 0 + + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; + char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + + Sleef_quad a_val, b_val; + memcpy(&a_val, A_ik, sizeof(Sleef_quad)); + memcpy(&b_val, B_kj, sizeof(Sleef_quad)); + + // sum += A[i,k] * B[k,j] + sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val)); + } + + memcpy(C_ij, &sum, sizeof(Sleef_quad)); + } + else { + // Long double backend + long double sum = 0.0L; + + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; + char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + + long double a_val, b_val; + memcpy(&a_val, A_ik, sizeof(long double)); + memcpy(&b_val, B_kj, sizeof(long double)); + + sum += a_val * b_val; + } + + memcpy(C_ij, &sum, sizeof(long double)); + } + } } - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; } + return 0; } -template +/** + * Register matmul support following the exact same pattern as binary_ops.cpp + */ int -create_matmul_ufunc(PyObject *numpy, const char *ufunc_name) +init_matmul_ops(PyObject *numpy) { - PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n"); + + // Get the existing matmul ufunc - same pattern as binary_ops + PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul"); if (ufunc == NULL) { + printf("DEBUG: Failed to get numpy.matmul\n"); return -1; } + // Use the same pattern as binary_ops.cpp PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; - PyType_Slot slots[] = { - {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, - {NPY_METH_strided_loop, - (void *)&quad_generic_matmul_strided_loop_aligned}, - {NPY_METH_unaligned_strided_loop, - (void *)&quad_generic_matmul_strided_loop_unaligned}, - {0, NULL}}; + PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, + {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, + {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop}, + {0, NULL}}; PyArrayMethod_Spec Spec = { .name = "quad_matmul", .nin = 2, .nout = 1, .casting = NPY_NO_CASTING, - .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE), + .flags = NPY_METH_SUPPORTS_UNALIGNED, .dtypes = dtypes, .slots = slots, }; + printf("DEBUG: About to add loop to matmul ufunc...\n"); + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + printf("DEBUG: Failed to add loop to matmul ufunc\n"); + Py_DECREF(ufunc); return -1; } - // my guess we don't need any promoter here as of now, since matmul is quad specific - return 0; -} -int -init_matmul_ops(PyObject *numpy) -{ - if (create_matmul_ufunc(numpy, "matmul") < 0) { + printf("DEBUG: Successfully added matmul loop!\n"); + + // Add promoter following binary_ops pattern + PyObject *promoter_capsule = + PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL); + if (promoter_capsule == NULL) { + Py_DECREF(ufunc); + return -1; + } + + PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type); + if (DTypes == NULL) { + Py_DECREF(promoter_capsule); + Py_DECREF(ufunc); return -1; } + + if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { + printf("DEBUG: Failed to add promoter (continuing anyway)\n"); + PyErr_Clear(); // Don't fail if promoter fails + } + else { + printf("DEBUG: Successfully added promoter\n"); + } + + Py_DECREF(DTypes); + Py_DECREF(promoter_capsule); + Py_DECREF(ufunc); + + printf("DEBUG: init_matmul_ops completed successfully\n"); return 0; -} +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h index bc099eba..947e2c3d 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.h +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h @@ -1,8 +1,40 @@ -#ifndef _QUADDTYPE_MATMUL_OPS_H -#define _QUADDTYPE_MATMUL_OPS_H +#ifndef _QUADDTYPE_MATMUL_H +#define _QUADDTYPE_MATMUL_H + +/** + * Quad Precision Matrix Multiplication for NumPy + * + * This module implements matrix multiplication functionality for the QuadPrecDType + * by registering custom loops with numpy's matmul generalized ufunc. + * + * Supports all matmul operation types: + * - Vector-vector (dot product): (n,) @ (n,) -> scalar + * - Matrix-vector: (m,n) @ (n,) -> (m,) + * - Vector-matrix: (n,) @ (n,p) -> (p,) + * - Matrix-matrix: (m,n) @ (n,p) -> (m,p) + * + * Uses naive algorithms optimized for correctness rather than performance. + * For production use, consider integration with QBLAS optimized routines. + */ #include +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the matmul operations for the quad precision dtype. + * This function registers the matmul generalized ufunc with numpy. + * + * @param numpy The numpy module object + * @return 0 on success, -1 on failure + */ int init_matmul_ops(PyObject *numpy); -#endif \ No newline at end of file + +#ifdef __cplusplus +} +#endif + +#endif // _QUADDTYPE_MATMUL_H \ No newline at end of file diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py index f3fa3f62..9256f3d8 100644 --- a/quaddtype/tests/test_dot.py +++ b/quaddtype/tests/test_dot.py @@ -340,7 +340,7 @@ def test_dimension_mismatch_vectors(self): x = create_quad_array([1, 2]) y = create_quad_array([1, 2, 3]) - with pytest.raises(ValueError, match="same length"): + with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"): np.matmul(x, y) def test_dimension_mismatch_matrix_vector(self): @@ -348,7 +348,7 @@ def test_dimension_mismatch_matrix_vector(self): A = create_quad_array([1, 2, 3, 4], shape=(2, 2)) x = create_quad_array([1, 2, 3]) # Wrong size - with pytest.raises(ValueError, match="columns must match"): + with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"): np.matmul(A, x) def test_dimension_mismatch_matrices(self): @@ -356,7 +356,7 @@ def test_dimension_mismatch_matrices(self): A = create_quad_array([1, 2, 3, 4], shape=(2, 2)) B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2)) # Wrong size - with pytest.raises(ValueError, match="Matrix inner dimensions must match"): + with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"): np.matmul(A, B) From 6800a906b1fbcdd363f02482c5b6f225463cc456 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sat, 19 Jul 2025 16:29:09 +0530 Subject: [PATCH 12/20] adding initial qblas support to matmul ufunc, something is breaking, nan --- .../src/quadblas_interface.cpp | 806 ++---------------- .../numpy_quaddtype/src/quadblas_interface.h | 34 +- .../numpy_quaddtype/src/umath/matmul.cpp | 263 +++--- 3 files changed, 257 insertions(+), 846 deletions(-) diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 6bc3fb0d..185e2d87 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -1,773 +1,126 @@ -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY -#define NO_IMPORT_UFUNC - -extern "C" { -#include -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/dtype_api.h" -} - -#include "scalar.h" -#include "dtype.h" -#include "quad_common.h" #include "quadblas_interface.h" - -extern "C" { -#include -#include -} - -#ifndef DISABLE_QUADBLAS #include "../QBLAS/include/quadblas/quadblas.hpp" -#endif - -#ifdef DISABLE_QUADBLAS - -static bool -extract_quad_array_info_simple(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend) -{ - if (!PyArray_Check(arr)) { - PyErr_SetString(PyExc_TypeError, "Expected numpy array"); - return false; - } - - PyArray_Descr *descr = PyArray_DESCR(arr); - if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) { - PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype"); - return false; - } - - QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr; - *backend = quad_descr->backend; - *data = (Sleef_quad *)PyArray_DATA(arr); - - return true; -} - -static Sleef_quad * -ensure_sleef_backend_simple(PyArrayObject *arr, QuadBackendType original_backend, - Sleef_quad **temp_storage) -{ - if (original_backend == BACKEND_SLEEF) { - *temp_storage = nullptr; - return (Sleef_quad *)PyArray_DATA(arr); - } - - npy_intp size = PyArray_SIZE(arr); - *temp_storage = (Sleef_quad *)malloc(size * sizeof(Sleef_quad)); - if (!*temp_storage) { - PyErr_NoMemory(); - return nullptr; - } - - long double *ld_data = (long double *)PyArray_DATA(arr); - for (npy_intp i = 0; i < size; i++) { - (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]); - } +#include +#include - return *temp_storage; -} - -// =============================================================================== -// FALLBACK IMPLEMENTATIONS (No QuadBLAS) -// =============================================================================== +extern "C" { -static PyObject * -dot_vector_vector_fallback(PyArrayObject *a, PyArrayObject *b) +int +qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) { - if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) { - PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays"); - return nullptr; - } - - npy_intp n_a = PyArray_DIM(a, 0); - npy_intp n_b = PyArray_DIM(b, 0); - - if (n_a != n_b) { - PyErr_SetString(PyExc_ValueError, "Arrays must have the same length"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - - if (!extract_quad_array_info_simple(a, &data_a, &backend_a) || - !extract_quad_array_info_simple(b, &data_b, &backend_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - free(temp_a); - free(temp_b); - return nullptr; - } - - // Simple dot product implementation - Sleef_quad result = Sleef_cast_from_doubleq1(0.0); - for (npy_intp i = 0; i < n_a; i++) { - result = Sleef_fmaq1_u05(sleef_a[i], sleef_b[i], result); - } - - free(temp_a); - free(temp_b); - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend); - if (!result_obj) { - return nullptr; + if (!x || !y || !result || n == 0) { + return -1; } - if (result_backend == BACKEND_SLEEF) { - result_obj->value.sleef_value = result; + try { + *result = QuadBLAS::dot(n, x, incx, y, incy); + return 0; } - else { - result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result); + catch (...) { + return -1; } - - return (PyObject *)result_obj; } -static PyObject * -dot_matrix_vector_fallback(PyArrayObject *a, PyArrayObject *b) +int +qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, + size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) { - if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) { - PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D"); - return nullptr; - } - - npy_intp m = PyArray_DIM(a, 0); - npy_intp n = PyArray_DIM(a, 1); - npy_intp n_b = PyArray_DIM(b, 0); - - if (n != n_b) { - PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - - if (!extract_quad_array_info_simple(a, &data_a, &backend_a) || - !extract_quad_array_info_simple(b, &data_b, &backend_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - free(temp_a); - free(temp_b); - return nullptr; - } - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - npy_intp result_dims[1] = {m}; - QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); - if (!result_dtype) { - free(temp_a); - free(temp_b); - return nullptr; - } - - PyArrayObject *result = - (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0); - if (!result) { - free(temp_a); - free(temp_b); - Py_DECREF(result_dtype); - return nullptr; - } - - Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); - - // Initialize result to zero - for (npy_intp i = 0; i < m; i++) { - result_data[i] = Sleef_cast_from_doubleq1(0.0); + if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) { + return -1; } - // Simple matrix-vector multiplication: result[i] = sum(A[i,j] * b[j]) - for (npy_intp i = 0; i < m; i++) { - Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); - for (npy_intp j = 0; j < n; j++) { - // Assume row-major layout: A[i,j] = sleef_a[i*n + j] - sum = Sleef_fmaq1_u05(sleef_a[i * n + j], sleef_b[j], sum); + try { + // Convert layout + QuadBLAS::Layout qblas_layout; + if (layout == 'R' || layout == 'r') { + qblas_layout = QuadBLAS::Layout::RowMajor; } - result_data[i] = sum; - } - - // Convert to longdouble if needed - if (result_backend == BACKEND_LONGDOUBLE) { - long double *ld_result = (long double *)PyArray_DATA(result); - for (npy_intp i = 0; i < m; i++) { - ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); + else if (layout == 'C' || layout == 'c') { + qblas_layout = QuadBLAS::Layout::ColMajor; } - } - - free(temp_a); - free(temp_b); - - return (PyObject *)result; -} - -static PyObject * -dot_matrix_matrix_fallback(PyArrayObject *a, PyArrayObject *b) -{ - if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) { - PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays"); - return nullptr; - } - - npy_intp m = PyArray_DIM(a, 0); - npy_intp k = PyArray_DIM(a, 1); - npy_intp k_b = PyArray_DIM(b, 0); - npy_intp n = PyArray_DIM(b, 1); - - if (k != k_b) { - PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - - if (!extract_quad_array_info_simple(a, &data_a, &backend_a) || - !extract_quad_array_info_simple(b, &data_b, &backend_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - free(temp_a); - free(temp_b); - return nullptr; - } - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - npy_intp result_dims[2] = {m, n}; - QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); - if (!result_dtype) { - free(temp_a); - free(temp_b); - return nullptr; - } - - PyArrayObject *result = - (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); - if (!result) { - free(temp_a); - free(temp_b); - Py_DECREF(result_dtype); - return nullptr; - } - - Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); - - // Initialize result matrix to zero - for (npy_intp i = 0; i < m * n; i++) { - result_data[i] = Sleef_cast_from_doubleq1(0.0); - } - - // Simple matrix-matrix multiplication: C[i,j] = sum(A[i,l] * B[l,j]) - for (npy_intp i = 0; i < m; i++) { - for (npy_intp j = 0; j < n; j++) { - Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); - for (npy_intp l = 0; l < k; l++) { - // Row-major: A[i,l] = sleef_a[i*k + l], B[l,j] = sleef_b[l*n + j] - sum = Sleef_fmaq1_u05(sleef_a[i * k + l], sleef_b[l * n + j], sum); - } - result_data[i * n + j] = sum; + else { + return -1; // Invalid layout } - } - // Convert to longdouble if needed - if (result_backend == BACKEND_LONGDOUBLE) { - long double *ld_result = (long double *)PyArray_DATA(result); - for (npy_intp i = 0; i < m * n; i++) { - ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); + // Handle transpose (swap dimensions for transpose) + size_t actual_m = m, actual_n = n; + if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') { + std::swap(actual_m, actual_n); + // For transpose, we need to adjust the layout + if (qblas_layout == QuadBLAS::Layout::RowMajor) { + qblas_layout = QuadBLAS::Layout::ColMajor; + } + else { + qblas_layout = QuadBLAS::Layout::RowMajor; + } } - } - free(temp_a); - free(temp_b); + // Call QBLAS GEMV + QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy); - return (PyObject *)result; -} - -PyObject * -py_quadblas_dot(PyObject *self, PyObject *args) -{ - PyObject *a_obj, *b_obj; - - if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) { - return nullptr; - } - - PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED); - PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED); - - if (!a || !b) { - Py_XDECREF(a); - Py_XDECREF(b); - PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays"); - return nullptr; - } - - PyObject *result = nullptr; - - int ndim_a = PyArray_NDIM(a); - int ndim_b = PyArray_NDIM(b); - - if (ndim_a == 1 && ndim_b == 1) { - result = dot_vector_vector_fallback(a, b); - } - else if (ndim_a == 2 && ndim_b == 1) { - result = dot_matrix_vector_fallback(a, b); + return 0; } - else if (ndim_a == 2 && ndim_b == 2) { - result = dot_matrix_matrix_fallback(a, b); + catch (...) { + return -1; } - else if (ndim_a == 1 && ndim_b == 2) { - PyErr_SetString(PyExc_ValueError, - "Vector-Matrix multiplication not supported (use Matrix-Vector instead)"); - } - else { - PyErr_SetString(PyExc_ValueError, - "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)"); - } - - Py_DECREF(a); - Py_DECREF(b); - - return result; } -// Dummy implementations for other QuadBLAS functions -PyObject * -py_quadblas_set_num_threads(PyObject *self, PyObject *args) -{ - // On Windows fallback, just ignore thread setting - Py_RETURN_NONE; -} - -PyObject * -py_quadblas_get_num_threads(PyObject *self, PyObject *args) +int +qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, + Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, + size_t ldc) { - // Return 1 for fallback implementation - return PyLong_FromLong(1); -} - -PyObject * -py_quadblas_get_version(PyObject *self, PyObject *args) -{ - return PyUnicode_FromString("QuadBLAS is disabled for MSVC"); -} - -#else - -static QuadBLAS::Layout -get_quadblas_layout(PyArrayObject *arr) -{ - if (PyArray_IS_C_CONTIGUOUS(arr)) { - return QuadBLAS::Layout::RowMajor; - } - else { - return QuadBLAS::Layout::ColMajor; - } -} - -static bool -extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend, - QuadBLAS::Layout *layout) -{ - if (!PyArray_Check(arr)) { - PyErr_SetString(PyExc_TypeError, "Expected numpy array"); - return false; - } - - PyArray_Descr *descr = PyArray_DESCR(arr); - if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) { - PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype"); - return false; - } - - QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr; - *backend = quad_descr->backend; - *data = (Sleef_quad *)PyArray_DATA(arr); - *layout = get_quadblas_layout(arr); - - return true; -} - -static Sleef_quad * -ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend, - Sleef_quad **temp_storage) -{ - if (original_backend == BACKEND_SLEEF) { - *temp_storage = nullptr; - return (Sleef_quad *)PyArray_DATA(arr); - } - - npy_intp size = PyArray_SIZE(arr); - *temp_storage = QuadBLAS::aligned_alloc(size); - if (!*temp_storage) { - PyErr_NoMemory(); - return nullptr; - } - - long double *ld_data = (long double *)PyArray_DATA(arr); - for (npy_intp i = 0; i < size; i++) { - (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]); - } - - return *temp_storage; -} - -static PyObject * -dot_vector_vector(PyArrayObject *a, PyArrayObject *b) -{ - if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) { - PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays"); - return nullptr; - } - - npy_intp n_a = PyArray_DIM(a, 0); - npy_intp n_b = PyArray_DIM(b, 0); - - if (n_a != n_b) { - PyErr_SetString(PyExc_ValueError, "Arrays must have the same length"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - QuadBLAS::Layout layout_a, layout_b; - - if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || - !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - npy_intp stride_a = PyArray_STRIDE(a, 0) / PyArray_ITEMSIZE(a); - npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b); - - Sleef_quad result = QuadBLAS::dot(n_a, sleef_a, stride_a, sleef_b, stride_b); - - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend); - if (!result_obj) { - return nullptr; - } - - if (result_backend == BACKEND_SLEEF) { - result_obj->value.sleef_value = result; - } - else { - result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result); - } - - return (PyObject *)result_obj; -} - -static PyObject * -dot_matrix_vector(PyArrayObject *a, PyArrayObject *b) -{ - if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) { - PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D"); - return nullptr; - } - - npy_intp m = PyArray_DIM(a, 0); - npy_intp n = PyArray_DIM(a, 1); - npy_intp n_b = PyArray_DIM(b, 0); - - if (n != n_b) { - PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - QuadBLAS::Layout layout_a, layout_b; - - if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || - !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - npy_intp result_dims[1] = {m}; - QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); - if (!result_dtype) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - PyArrayObject *result = - (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0); - if (!result) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - Py_DECREF(result_dtype); - return nullptr; + if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) { + return -1; } - Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); - - npy_intp lda; - if (layout_a == QuadBLAS::Layout::RowMajor) { - lda = n; - } - else { - lda = m; - } - - npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b); - npy_intp stride_result = PyArray_STRIDE(result, 0) / PyArray_ITEMSIZE(result); - - Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); - Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - - QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, sleef_b, stride_b, beta, result_data, - stride_result); - - if (result_backend == BACKEND_LONGDOUBLE) { - long double *ld_result = (long double *)PyArray_DATA(result); - for (npy_intp i = 0; i < m; i++) { - ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); + try { + // Convert layout + QuadBLAS::Layout qblas_layout; + if (layout == 'R' || layout == 'r') { + qblas_layout = QuadBLAS::Layout::RowMajor; + } + else if (layout == 'C' || layout == 'c') { + qblas_layout = QuadBLAS::Layout::ColMajor; + } + else { + return -1; // Invalid layout } - } - - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - - return (PyObject *)result; -} - -static PyObject * -dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b) -{ - if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) { - PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays"); - return nullptr; - } - - npy_intp m = PyArray_DIM(a, 0); - npy_intp k = PyArray_DIM(a, 1); - npy_intp k_b = PyArray_DIM(b, 0); - npy_intp n = PyArray_DIM(b, 1); - - if (k != k_b) { - PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match"); - return nullptr; - } - - Sleef_quad *data_a, *data_b; - QuadBackendType backend_a, backend_b; - QuadBLAS::Layout layout_a, layout_b; - - if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) || - !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) { - return nullptr; - } - - Sleef_quad *temp_a = nullptr, *temp_b = nullptr; - Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a); - Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b); - - if (!sleef_a || !sleef_b) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - QuadBackendType result_backend = BACKEND_SLEEF; - if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) { - result_backend = BACKEND_LONGDOUBLE; - } - - npy_intp result_dims[2] = {m, n}; - QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend); - if (!result_dtype) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - return nullptr; - } - - PyArrayObject *result = - (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0); - if (!result) { - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - Py_DECREF(result_dtype); - return nullptr; - } - - Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result); - for (npy_intp i = 0; i < m * n; i++) { - result_data[i] = Sleef_cast_from_doubleq1(0.0); - } - - npy_intp lda, ldb, ldc; - if (layout_a == QuadBLAS::Layout::RowMajor) { - lda = k; - } - else { - lda = m; - } + // For now, we only support no transpose + // TODO: Implement transpose support if needed + if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) { + return -1; // Transpose not implemented yet + } - if (layout_b == QuadBLAS::Layout::RowMajor) { - ldb = n; - } - else { - ldb = k; - } + // Call QBLAS GEMM + QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc); - QuadBLAS::Layout result_layout = layout_a; - if (result_layout == QuadBLAS::Layout::RowMajor) { - ldc = n; - } - else { - ldc = m; + return 0; } - - Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); - Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - - QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data, - ldc); - - if (result_backend == BACKEND_LONGDOUBLE) { - long double *ld_result = (long double *)PyArray_DATA(result); - for (npy_intp i = 0; i < m * n; i++) { - ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]); - } + catch (...) { + return -1; } - - QuadBLAS::aligned_free(temp_a); - QuadBLAS::aligned_free(temp_b); - - return (PyObject *)result; } -PyObject * -py_quadblas_dot(PyObject *self, PyObject *args) +int +qblas_supports_backend(QuadBackendType backend) { - PyObject *a_obj, *b_obj; - - if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) { - return nullptr; - } - - PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED); - PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED); - - if (!a || !b) { - Py_XDECREF(a); - Py_XDECREF(b); - PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays"); - return nullptr; - } - - PyObject *result = nullptr; - - int ndim_a = PyArray_NDIM(a); - int ndim_b = PyArray_NDIM(b); - - if (ndim_a == 1 && ndim_b == 1) { - result = dot_vector_vector(a, b); - } - else if (ndim_a == 2 && ndim_b == 1) { - result = dot_matrix_vector(a, b); - } - else if (ndim_a == 2 && ndim_b == 2) { - result = dot_matrix_matrix(a, b); - } - else if (ndim_a == 1 && ndim_b == 2) { - PyErr_SetString(PyExc_ValueError, - "Vector-Matrix multiplication not supported (use Matrix-Vector instead)"); - } - else { - PyErr_SetString(PyExc_ValueError, - "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)"); - } - - Py_DECREF(a); - Py_DECREF(b); - - return result; + // QBLAS only supports SLEEF backend + return (backend == BACKEND_SLEEF) ? 1 : 0; } PyObject * py_quadblas_set_num_threads(PyObject *self, PyObject *args) { int num_threads; - if (!PyArg_ParseTuple(args, "i", &num_threads)) { - return nullptr; + return NULL; } - if (num_threads < 1) { + if (num_threads <= 0) { PyErr_SetString(PyExc_ValueError, "Number of threads must be positive"); - return nullptr; + return NULL; } QuadBLAS::set_num_threads(num_threads); @@ -777,13 +130,14 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args) PyObject * py_quadblas_get_num_threads(PyObject *self, PyObject *args) { - return PyLong_FromLong(QuadBLAS::get_num_threads()); + int num_threads = QuadBLAS::get_num_threads(); + return PyLong_FromLong(num_threads); } PyObject * py_quadblas_get_version(PyObject *self, PyObject *args) { - return PyUnicode_FromString(QuadBLAS::VERSION); + return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS"); } -#endif // DISABLE_QUADBLAS \ No newline at end of file +} // extern "C" \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h index da8f0a85..ff9ed537 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h @@ -1,23 +1,39 @@ -#ifndef _QUADDTYPE_QUADBLAS_INTERFACE_H -#define _QUADDTYPE_QUADBLAS_INTERFACE_H +#ifndef QUADBLAS_INTERFACE_H +#define QUADBLAS_INTERFACE_H + +#include +#include +#include "quad_common.h" +#include #ifdef __cplusplus extern "C" { #endif -#include - +int +qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result); -PyObject* py_quadblas_dot(PyObject* self, PyObject* args); +int +qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, + size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy); +int +qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, + Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, + size_t ldc); -PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args); -PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args); +int +qblas_supports_backend(QuadBackendType backend); -PyObject* py_quadblas_get_version(PyObject* self, PyObject* args); +PyObject * +py_quadblas_set_num_threads(PyObject *self, PyObject *args); +PyObject * +py_quadblas_get_num_threads(PyObject *self, PyObject *args); +PyObject * +py_quadblas_get_version(PyObject *self, PyObject *args); #ifdef __cplusplus } #endif -#endif \ No newline at end of file +#endif // QUADBLAS_INTERFACE_H \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index 00cc8589..d1adc7f3 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -22,43 +22,36 @@ extern "C" { #include "../ops.hpp" #include "matmul.h" #include "promoters.hpp" +#include "../quadblas_interface.h" /** * Resolve descriptors for matmul operation. - * Follows the same pattern as binary_ops.cpp + * Only supports SLEEF backend when QBLAS is enabled. */ static NPY_CASTING quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) { - // Follow the exact same pattern as quad_binary_op_resolve_descriptors QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; - QuadBackendType target_backend; - // Determine target backend and if casting is needed - NPY_CASTING casting = NPY_NO_CASTING; - if (descr_in1->backend != descr_in2->backend) { - target_backend = BACKEND_LONGDOUBLE; - casting = NPY_SAFE_CASTING; - } - else { - target_backend = descr_in1->backend; + // QBLAS only supports SLEEF backend + if (descr_in1->backend != BACKEND_SLEEF || descr_in2->backend != BACKEND_SLEEF) { + PyErr_SetString(PyExc_NotImplementedError, + "QBLAS-accelerated matmul only supports SLEEF backend. " + "Other backends are not supported with QBLAS."); + return (NPY_CASTING)-1; } - // Set up input descriptors, casting if necessary + // Both inputs must use SLEEF backend + QuadBackendType target_backend = BACKEND_SLEEF; + NPY_CASTING casting = NPY_NO_CASTING; + + // Set up input descriptors for (int i = 0; i < 2; i++) { - if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { - loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[i]) { - return (NPY_CASTING)-1; - } - } - else { - Py_INCREF(given_descrs[i]); - loop_descrs[i] = given_descrs[i]; - } + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; } // Set up output descriptor @@ -71,10 +64,9 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[ else { QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2]; if (descr_out->backend != target_backend) { - loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); - if (!loop_descrs[2]) { - return (NPY_CASTING)-1; - } + PyErr_SetString(PyExc_NotImplementedError, + "QBLAS-accelerated matmul only supports SLEEF backend for output."); + return (NPY_CASTING)-1; } else { Py_INCREF(given_descrs[2]); @@ -85,117 +77,166 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[ } /** - * Matrix multiplication strided loop using NumPy 2.0 API. - * Implements general matrix multiplication for arbitrary dimensions. - * - * For matmul with signature (m?,n),(n,p?)->(m?,p?): - * - dimensions[0] = N (loop dimension, number of batch operations) - * - dimensions[1] = m (rows of first matrix) - * - dimensions[2] = n (cols of first matrix / rows of second matrix) - * - dimensions[3] = p (cols of second matrix) - * - * - strides[0], strides[1], strides[2] = batch strides for A, B, C - * - strides[3], strides[4] = row stride, col stride for A (m, n) - * - strides[5], strides[6] = row stride, col stride for B (n, p) - * - strides[7], strides[8] = row stride, col stride for C (m, p) + * Determine the type of operation based on input dimensions + */ +enum MatmulOperationType { + MATMUL_DOT, // 1D x 1D -> scalar + MATMUL_GEMV, // 2D x 1D -> 1D + MATMUL_GEMM // 2D x 2D -> 2D +}; + +static MatmulOperationType +determine_operation_type(npy_intp m, npy_intp n, npy_intp p) +{ + // For matmul signature (m?,n),(n,p?)->(m?,p?): + // - If m=1 and p=1: vector dot product (1D x 1D) + // - If p=1: matrix-vector multiplication (2D x 1D) + // - Otherwise: matrix-matrix multiplication (2D x 2D) + + if (m == 1 && p == 1) { + return MATMUL_DOT; + } + else if (p == 1) { + return MATMUL_GEMV; + } + else { + return MATMUL_GEMM; + } +} + +/** + * Matrix multiplication strided loop using QBLAS. + * Automatically selects the appropriate QBLAS operation based on input dimensions. */ static int quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata) { // Extract dimensions - npy_intp N = dimensions[0]; // Number of batch operations + npy_intp N = dimensions[0]; // Batch size, this remains always 1 for matmul afaik npy_intp m = dimensions[1]; // Rows of first matrix npy_intp n = dimensions[2]; // Cols of first matrix / rows of second matrix npy_intp p = dimensions[3]; // Cols of second matrix // Extract batch strides - npy_intp A_batch_stride = strides[0]; - npy_intp B_batch_stride = strides[1]; - npy_intp C_batch_stride = strides[2]; + npy_intp A_stride = strides[0]; + npy_intp B_stride = strides[1]; + npy_intp C_stride = strides[2]; // Extract core strides for matrix dimensions - npy_intp A_row_stride = strides[3]; // Stride along m dimension of A - npy_intp A_col_stride = strides[4]; // Stride along n dimension of A - npy_intp B_row_stride = strides[5]; // Stride along n dimension of B - npy_intp B_col_stride = strides[6]; // Stride along p dimension of B - npy_intp C_row_stride = strides[7]; // Stride along m dimension of C - npy_intp C_col_stride = strides[8]; // Stride along p dimension of C - - // Get backend from descriptor + npy_intp A_row_stride = strides[3]; + npy_intp A_col_stride = strides[4]; + npy_intp B_row_stride = strides[5]; + npy_intp B_col_stride = strides[6]; + npy_intp C_row_stride = strides[7]; + npy_intp C_col_stride = strides[8]; + + // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls + + // Get backend from descriptor (should be SLEEF only) QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - QuadBackendType backend = descr->backend; - size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - - // Process each batch - for (npy_intp batch = 0; batch < N; batch++) { - char *A_batch = data[0] + batch * A_batch_stride; - char *B_batch = data[1] + batch * B_batch_stride; - char *C_batch = data[2] + batch * C_batch_stride; - - // Perform matrix multiplication: C = A @ B - // C[i,j] = sum_k(A[i,k] * B[k,j]) - for (npy_intp i = 0; i < m; i++) { - for (npy_intp j = 0; j < p; j++) { - char *C_ij = C_batch + i * C_row_stride + j * C_col_stride; - - if (backend == BACKEND_SLEEF) { - Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); // Initialize to 0 - - for (npy_intp k = 0; k < n; k++) { - char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; - char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; - - Sleef_quad a_val, b_val; - memcpy(&a_val, A_ik, sizeof(Sleef_quad)); - memcpy(&b_val, B_kj, sizeof(Sleef_quad)); - - // sum += A[i,k] * B[k,j] - sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val)); - } - - memcpy(C_ij, &sum, sizeof(Sleef_quad)); - } - else { - // Long double backend - long double sum = 0.0L; - - for (npy_intp k = 0; k < n; k++) { - char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; - char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; - - long double a_val, b_val; - memcpy(&a_val, A_ik, sizeof(long double)); - memcpy(&b_val, B_kj, sizeof(long double)); - - sum += a_val * b_val; - } - - memcpy(C_ij, &sum, sizeof(long double)); - } - } + if (descr->backend != BACKEND_SLEEF) { + PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul"); + return -1; + } + + // Determine operation type + MatmulOperationType op_type = determine_operation_type(m, n, p); + + // Constants for QBLAS + Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); + Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); + + // print all information for debugging + printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N, + (long)m, (long)n, (long)p); + printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride, + (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride, + (long)C_col_stride); + printf("DEBUG: Operation type: %d\n", op_type); + + char *A = data[0]; + char *B = data[1]; + char *C = data[2]; + + Sleef_quad *A_ptr = (Sleef_quad *)A; + Sleef_quad *B_ptr = (Sleef_quad *)B; + Sleef_quad *C_ptr = (Sleef_quad *)C; + + int result = -1; + + switch (op_type) { + case MATMUL_DOT: { + // Vector dot product: C = A^T * B (both are vectors) + // A has shape (1, n), B has shape (n, 1), C has shape (1, 1) + + printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n); + + // A is effectively a vector of length n + // B is effectively a vector of length n + size_t incx = A_col_stride / sizeof(Sleef_quad); + size_t incy = B_row_stride / sizeof(Sleef_quad); + + result = qblas_dot(n, A_ptr, incx, B_ptr, incy, C_ptr); + break; } + + case MATMUL_GEMV: { + // Matrix-vector multiplication: C = A * B + // A has shape (m, n), B has shape (n, 1), C has shape (m, 1) + + printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m, + (long)n, (long)n); + + size_t lda = A_row_stride / sizeof(Sleef_quad); + size_t incx = B_row_stride / sizeof(Sleef_quad); + size_t incy = C_row_stride / sizeof(Sleef_quad); + + result = + qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy); + break; + } + + case MATMUL_GEMM: { + // Matrix-matrix multiplication: C = A * B + // A has shape (m, n), B has shape (n, p), C has shape (m, p) + + printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p); + + size_t lda = A_row_stride / sizeof(Sleef_quad); + size_t ldb = B_row_stride / sizeof(Sleef_quad); + size_t ldc = C_row_stride / sizeof(Sleef_quad); + + result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta, + C_ptr, ldc); + break; + } + } + + if (result != 0) { + PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); + return -1; } return 0; } /** - * Register matmul support following the exact same pattern as binary_ops.cpp + * Register matmul support with QBLAS acceleration */ int init_matmul_ops(PyObject *numpy) { - printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n"); + printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n"); - // Get the existing matmul ufunc - same pattern as binary_ops + // Get the existing matmul ufunc PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul"); if (ufunc == NULL) { printf("DEBUG: Failed to get numpy.matmul\n"); return -1; } - // Use the same pattern as binary_ops.cpp + // Setup method specification for QBLAS-accelerated matmul PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, @@ -204,7 +245,7 @@ init_matmul_ops(PyObject *numpy) {0, NULL}}; PyArrayMethod_Spec Spec = { - .name = "quad_matmul", + .name = "quad_matmul_qblas", .nin = 2, .nout = 1, .casting = NPY_NO_CASTING, @@ -213,17 +254,17 @@ init_matmul_ops(PyObject *numpy) .slots = slots, }; - printf("DEBUG: About to add loop to matmul ufunc...\n"); + printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n"); if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - printf("DEBUG: Failed to add loop to matmul ufunc\n"); + printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n"); Py_DECREF(ufunc); return -1; } - printf("DEBUG: Successfully added matmul loop!\n"); + printf("DEBUG: Successfully added QBLAS matmul loop!\n"); - // Add promoter following binary_ops pattern + // Add promoter PyObject *promoter_capsule = PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL); if (promoter_capsule == NULL) { @@ -250,6 +291,6 @@ init_matmul_ops(PyObject *numpy) Py_DECREF(promoter_capsule); Py_DECREF(ufunc); - printf("DEBUG: init_matmul_ops completed successfully\n"); + printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n"); return 0; } \ No newline at end of file From 742ce642aa173487a8253da10518e30419621ec3 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sat, 19 Jul 2025 17:38:59 +0530 Subject: [PATCH 13/20] matmul ufunc completed, naive plugged, qblas experimental --- quaddtype/numpy_quaddtype/QBLAS | 2 +- .../src/quadblas_interface.cpp | 2 - .../numpy_quaddtype/src/umath/matmul.cpp | 189 +++++++++++------- 3 files changed, 116 insertions(+), 77 deletions(-) diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS index 4853ac1c..9468e24a 160000 --- a/quaddtype/numpy_quaddtype/QBLAS +++ b/quaddtype/numpy_quaddtype/QBLAS @@ -1 +1 @@ -Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d +Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102 diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 185e2d87..7ef618fd 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -75,7 +75,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, } try { - // Convert layout QuadBLAS::Layout qblas_layout; if (layout == 'R' || layout == 'r') { qblas_layout = QuadBLAS::Layout::RowMajor; @@ -93,7 +92,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, return -1; // Transpose not implemented yet } - // Call QBLAS GEMM QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc); return 0; diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index d1adc7f3..e192a670 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -24,10 +24,6 @@ extern "C" { #include "promoters.hpp" #include "../quadblas_interface.h" -/** - * Resolve descriptors for matmul operation. - * Only supports SLEEF backend when QBLAS is enabled. - */ static NPY_CASTING quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], @@ -76,23 +72,15 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[ return casting; } -/** - * Determine the type of operation based on input dimensions - */ enum MatmulOperationType { - MATMUL_DOT, // 1D x 1D -> scalar - MATMUL_GEMV, // 2D x 1D -> 1D - MATMUL_GEMM // 2D x 2D -> 2D + MATMUL_DOT, + MATMUL_GEMV, + MATMUL_GEMM }; static MatmulOperationType determine_operation_type(npy_intp m, npy_intp n, npy_intp p) { - // For matmul signature (m?,n),(n,p?)->(m?,p?): - // - If m=1 and p=1: vector dot product (1D x 1D) - // - If p=1: matrix-vector multiplication (2D x 1D) - // - Otherwise: matrix-matrix multiplication (2D x 2D) - if (m == 1 && p == 1) { return MATMUL_DOT; } @@ -104,10 +92,6 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p) } } -/** - * Matrix multiplication strided loop using QBLAS. - * Automatically selects the appropriate QBLAS operation based on input dimensions. - */ static int quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata) @@ -118,12 +102,12 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp n = dimensions[2]; // Cols of first matrix / rows of second matrix npy_intp p = dimensions[3]; // Cols of second matrix - // Extract batch strides + // batch strides npy_intp A_stride = strides[0]; npy_intp B_stride = strides[1]; npy_intp C_stride = strides[2]; - // Extract core strides for matrix dimensions + // core strides for matrix dimensions npy_intp A_row_stride = strides[3]; npy_intp A_col_stride = strides[4]; npy_intp B_row_stride = strides[5]; @@ -131,30 +115,16 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp C_row_stride = strides[7]; npy_intp C_col_stride = strides[8]; - // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls - - // Get backend from descriptor (should be SLEEF only) QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; if (descr->backend != BACKEND_SLEEF) { PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul"); return -1; } - // Determine operation type MatmulOperationType op_type = determine_operation_type(m, n, p); - - // Constants for QBLAS Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - // print all information for debugging - printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N, - (long)m, (long)n, (long)p); - printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride, - (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride, - (long)C_col_stride); - printf("DEBUG: Operation type: %d\n", op_type); - char *A = data[0]; char *B = data[1]; char *C = data[2]; @@ -167,13 +137,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], switch (op_type) { case MATMUL_DOT: { - // Vector dot product: C = A^T * B (both are vectors) - // A has shape (1, n), B has shape (n, 1), C has shape (1, 1) - - printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n); - - // A is effectively a vector of length n - // B is effectively a vector of length n size_t incx = A_col_stride / sizeof(Sleef_quad); size_t incy = B_row_stride / sizeof(Sleef_quad); @@ -182,12 +145,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], } case MATMUL_GEMV: { - // Matrix-vector multiplication: C = A * B - // A has shape (m, n), B has shape (n, 1), C has shape (m, 1) - - printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m, - (long)n, (long)n); - size_t lda = A_row_stride / sizeof(Sleef_quad); size_t incx = B_row_stride / sizeof(Sleef_quad); size_t incy = C_row_stride / sizeof(Sleef_quad); @@ -198,17 +155,46 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], } case MATMUL_GEMM: { - // Matrix-matrix multiplication: C = A * B - // A has shape (m, n), B has shape (n, p), C has shape (m, p) - - printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p); - size_t lda = A_row_stride / sizeof(Sleef_quad); size_t ldb = B_row_stride / sizeof(Sleef_quad); - size_t ldc = C_row_stride / sizeof(Sleef_quad); + size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad); + + Sleef_quad *temp_A_buffer = new Sleef_quad[m * n]; + if (!temp_A_buffer) { + PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM"); + delete[] temp_A_buffer; + return -1; + } + Sleef_quad *temp_B_buffer = new Sleef_quad[n * p]; + if (!temp_B_buffer) { + PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM"); + delete[] temp_A_buffer; + return -1; + } + memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad)); + memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad)); + A_ptr = temp_A_buffer; + B_ptr = temp_B_buffer; + + Sleef_quad *temp_C_buffer = new Sleef_quad[m * p]; + if (!temp_C_buffer) { + PyErr_SetString(PyExc_MemoryError, + "Failed to allocate temporary buffer for GEMM result"); + return -1; + } + + size_t ldc_temp = p; result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta, - C_ptr, ldc); + temp_C_buffer, ldc_temp); + + if (result == 0) { + memcpy(C_ptr, temp_C_buffer, m * p * sizeof(Sleef_quad)); + } + + delete[] temp_C_buffer; + delete[] temp_A_buffer; + delete[] temp_B_buffer; break; } } @@ -221,27 +207,91 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], return 0; } -/** - * Register matmul support with QBLAS acceleration - */ +static int +naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + npy_intp m = dimensions[1]; + npy_intp n = dimensions[2]; + npy_intp p = dimensions[3]; + + npy_intp A_batch_stride = strides[0]; + npy_intp B_batch_stride = strides[1]; + npy_intp C_batch_stride = strides[2]; + + npy_intp A_row_stride = strides[3]; + npy_intp A_col_stride = strides[4]; + npy_intp B_row_stride = strides[5]; + npy_intp B_col_stride = strides[6]; + npy_intp C_row_stride = strides[7]; + npy_intp C_col_stride = strides[8]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + for (npy_intp batch = 0; batch < N; batch++) { + char *A_batch = data[0] + batch * A_batch_stride; + char *B_batch = data[1] + batch * B_batch_stride; + char *C_batch = data[2] + batch * C_batch_stride; + + for (npy_intp i = 0; i < m; i++) { + for (npy_intp j = 0; j < p; j++) { + char *C_ij = C_batch + i * C_row_stride + j * C_col_stride; + + if (backend == BACKEND_SLEEF) { + Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); + + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; + char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + + Sleef_quad a_val, b_val; + memcpy(&a_val, A_ik, sizeof(Sleef_quad)); + memcpy(&b_val, B_kj, sizeof(Sleef_quad)); + sum = Sleef_fmaq1_u05(a_val, b_val, sum); + } + + memcpy(C_ij, &sum, sizeof(Sleef_quad)); + } + else { + long double sum = 0.0L; + + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; + char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + + long double a_val, b_val; + memcpy(&a_val, A_ik, sizeof(long double)); + memcpy(&b_val, B_kj, sizeof(long double)); + + sum += a_val * b_val; + } + + memcpy(C_ij, &sum, sizeof(long double)); + } + } + } + } + + return 0; +} + int init_matmul_ops(PyObject *numpy) { - printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n"); - - // Get the existing matmul ufunc PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul"); if (ufunc == NULL) { - printf("DEBUG: Failed to get numpy.matmul\n"); return -1; } - // Setup method specification for QBLAS-accelerated matmul PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, - {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop}, + {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop}, + {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, {0, NULL}}; PyArrayMethod_Spec Spec = { @@ -254,17 +304,11 @@ init_matmul_ops(PyObject *numpy) .slots = slots, }; - printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n"); - if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n"); Py_DECREF(ufunc); return -1; } - printf("DEBUG: Successfully added QBLAS matmul loop!\n"); - - // Add promoter PyObject *promoter_capsule = PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL); if (promoter_capsule == NULL) { @@ -280,17 +324,14 @@ init_matmul_ops(PyObject *numpy) } if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { - printf("DEBUG: Failed to add promoter (continuing anyway)\n"); PyErr_Clear(); // Don't fail if promoter fails } else { - printf("DEBUG: Successfully added promoter\n"); } Py_DECREF(DTypes); Py_DECREF(promoter_capsule); Py_DECREF(ufunc); - printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n"); return 0; } \ No newline at end of file From d993bc92bb760c4e0428ac9542ed97272247a4af Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sat, 19 Jul 2025 17:47:11 +0530 Subject: [PATCH 14/20] adding release tracker to keep record for tasks, v1.0.0 --- quaddtype/release_tracker.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md index 1ecf7d3a..3ed10042 100644 --- a/quaddtype/release_tracker.md +++ b/quaddtype/release_tracker.md @@ -1,4 +1,4 @@ -# Plan for `numpy-quaddtype` v1.5 +# Plan for `numpy-quaddtype` v1.0.0 | ufunc name | Added | | ------------- | ----- | @@ -91,3 +91,5 @@ | floor | ✅ | | ceil | ✅ | | trunc | ✅ | + +- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback From c518a29281de14955e8125cb8845200d9a8b46a9 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Sat, 19 Jul 2025 15:33:46 +0000 Subject: [PATCH 15/20] it should be failing but passes on x86-64 --- quaddtype/numpy_quaddtype/QBLAS | 2 +- quaddtype/numpy_quaddtype/src/umath/matmul.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS index 9468e24a..0eabb677 160000 --- a/quaddtype/numpy_quaddtype/QBLAS +++ b/quaddtype/numpy_quaddtype/QBLAS @@ -1 +1 @@ -Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102 +Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16 diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index e192a670..f31ec89f 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -290,7 +290,7 @@ init_matmul_ops(PyObject *numpy) PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop}, + {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, {0, NULL}}; From bbce2ac58f930ad04a89ea877ef2013f5b49c8c7 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Sat, 19 Jul 2025 18:17:04 +0000 Subject: [PATCH 16/20] ahh stupid me :), fallback to naive for MSVC --- quaddtype/numpy_quaddtype/__init__.py | 6 +- .../src/quadblas_interface.cpp | 94 ++++++++++++++++++- .../numpy_quaddtype/src/quadblas_interface.h | 5 + .../numpy_quaddtype/src/umath/matmul.cpp | 11 +++ 4 files changed, 110 insertions(+), 6 deletions(-) diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py index 8b588c11..878180bc 100644 --- a/quaddtype/numpy_quaddtype/__init__.py +++ b/quaddtype/numpy_quaddtype/__init__.py @@ -39,8 +39,4 @@ def LongDoubleQuadPrecDType(): ln10 = get_sleef_constant("ln10") max_value = get_sleef_constant("quad_max") min_value = get_sleef_constant("quad_min") -epsilon = get_sleef_constant("epsilon") - -num_cores = multiprocessing.cpu_count() -# set default number of threads for QuadBLAS -set_num_threads(num_cores) \ No newline at end of file +epsilon = get_sleef_constant("epsilon") \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 7ef618fd..6eb57574 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -1,10 +1,16 @@ #include "quadblas_interface.h" -#include "../QBLAS/include/quadblas/quadblas.hpp" #include #include +#ifndef DISABLE_QUADBLAS +#include "../QBLAS/include/quadblas/quadblas.hpp" +#endif // DISABLE_QUADBLAS + extern "C" { + +#ifndef DISABLE_QUADBLAS + int qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) { @@ -138,4 +144,90 @@ py_quadblas_get_version(PyObject *self, PyObject *args) return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS"); } +int +quadblas_set_num_threads(int num_threads) +{ + QuadBLAS::set_num_threads(num_threads); + return 0; +} + +int +quadblas_get_num_threads(void) +{ + int num_threads = QuadBLAS::get_num_threads(); + return num_threads; +} + +#else // DISABLE_QUADBLAS + + +int +qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result) +{ + return -1; // QBLAS is disabled, dot product not available +} + +int +qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A, + size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy) +{ + return -1; // QBLAS is disabled, GEMV not available +} + +int +qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha, + Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, + size_t ldc) +{ + return -1; // QBLAS is disabled, GEMM not available +} + +int +qblas_supports_backend(QuadBackendType backend) +{ + return -1; // QBLAS is disabled, backend support not available +} + +PyObject * +py_quadblas_set_num_threads(PyObject *self, PyObject *args) +{ + // raise error + PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); + return NULL; +} + +PyObject * +py_quadblas_get_num_threads(PyObject *self, PyObject *args) +{ + // raise error + PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); + return NULL; +} + +PyObject * +py_quadblas_get_version(PyObject *self, PyObject *args) +{ + // raise error + PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); + return NULL; +} + +int +quadblas_set_num_threads(int num_threads) +{ + // raise error + PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); + return -1; +} + +int +quadblas_get_num_threads(void) +{ + // raise error + PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); + return -1; +} + +#endif // DISABLE_QUADBLAS + } // extern "C" \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h index ff9ed537..82a685a9 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h @@ -32,6 +32,11 @@ py_quadblas_get_num_threads(PyObject *self, PyObject *args); PyObject * py_quadblas_get_version(PyObject *self, PyObject *args); +int +quadblas_set_num_threads(int num_threads); +int +quadblas_get_num_threads(void); + #ifdef __cplusplus } #endif diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index f31ec89f..c9fec506 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -289,10 +289,21 @@ init_matmul_ops(PyObject *numpy) PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; + #ifndef DISABLE_QUADBLAS + // set threading to max + int num_threads = quadblas_get_num_threads(); + quadblas_set_num_threads(num_threads); + PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, {0, NULL}}; + #else + PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, + {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop}, + {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, + {0, NULL}}; + #endif // DISABLE_QUADBLAS PyArrayMethod_Spec Spec = { .name = "quad_matmul_qblas", From 5e5fa659fcce5e3daa3a1d7cd771244290ea8531 Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Sat, 19 Jul 2025 18:26:27 +0000 Subject: [PATCH 17/20] switching to internal function use only --- quaddtype/numpy_quaddtype/src/quadblas_interface.cpp | 8 ++++---- quaddtype/numpy_quaddtype/src/quadblas_interface.h | 4 ++-- quaddtype/numpy_quaddtype/src/umath/matmul.cpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp index 6eb57574..65feb604 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp @@ -145,14 +145,14 @@ py_quadblas_get_version(PyObject *self, PyObject *args) } int -quadblas_set_num_threads(int num_threads) +_quadblas_set_num_threads(int num_threads) { QuadBLAS::set_num_threads(num_threads); return 0; } int -quadblas_get_num_threads(void) +_quadblas_get_num_threads(void) { int num_threads = QuadBLAS::get_num_threads(); return num_threads; @@ -213,7 +213,7 @@ py_quadblas_get_version(PyObject *self, PyObject *args) } int -quadblas_set_num_threads(int num_threads) +_quadblas_set_num_threads(int num_threads) { // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); @@ -221,7 +221,7 @@ quadblas_set_num_threads(int num_threads) } int -quadblas_get_num_threads(void) +_quadblas_get_num_threads(void) { // raise error PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled"); diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h index 82a685a9..76033ebc 100644 --- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h +++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h @@ -33,9 +33,9 @@ PyObject * py_quadblas_get_version(PyObject *self, PyObject *args); int -quadblas_set_num_threads(int num_threads); +_quadblas_set_num_threads(int num_threads); int -quadblas_get_num_threads(void); +_quadblas_get_num_threads(void); #ifdef __cplusplus } diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index c9fec506..98053b82 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -291,9 +291,9 @@ init_matmul_ops(PyObject *numpy) #ifndef DISABLE_QUADBLAS // set threading to max - int num_threads = quadblas_get_num_threads(); - quadblas_set_num_threads(num_threads); - + int num_threads = _quadblas_get_num_threads(); + _quadblas_set_num_threads(num_threads); + PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, From cec5ace3a011cb38c57bf0cd33b99b5df0d36cc6 Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sun, 20 Jul 2025 03:46:52 +0530 Subject: [PATCH 18/20] this should fix them all --- quaddtype/numpy_quaddtype/QBLAS | 2 +- .../numpy_quaddtype/src/umath/matmul.cpp | 216 +++++++++++++----- 2 files changed, 160 insertions(+), 58 deletions(-) diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS index 0eabb677..9468e24a 160000 --- a/quaddtype/numpy_quaddtype/QBLAS +++ b/quaddtype/numpy_quaddtype/QBLAS @@ -1 +1 @@ -Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16 +Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102 diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp index 98053b82..354a342d 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp @@ -93,8 +93,9 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p) } static int -quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata) +quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) { // Extract dimensions npy_intp N = dimensions[0]; // Batch size, this remains always 1 for matmul afaik @@ -149,6 +150,8 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], size_t incx = B_row_stride / sizeof(Sleef_quad); size_t incy = C_row_stride / sizeof(Sleef_quad); + memset(C_ptr, 0, m * p * sizeof(Sleef_quad)); + result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy); break; @@ -159,32 +162,132 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], size_t ldb = B_row_stride / sizeof(Sleef_quad); size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad); + memset(C_ptr, 0, m * p * sizeof(Sleef_quad)); + + size_t ldc_temp = p; + + result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta, + C_ptr, ldc_numpy); + break; + } + } + + if (result != 0) { + PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); + return -1; + } + + return 0; +} + +static int +quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + // Extract dimensions + npy_intp N = dimensions[0]; // Batch size, this remains always 1 for matmul afaik + npy_intp m = dimensions[1]; // Rows of first matrix + npy_intp n = dimensions[2]; // Cols of first matrix / rows of second matrix + npy_intp p = dimensions[3]; // Cols of second matrix + + // batch strides + npy_intp A_stride = strides[0]; + npy_intp B_stride = strides[1]; + npy_intp C_stride = strides[2]; + + // core strides for matrix dimensions + npy_intp A_row_stride = strides[3]; + npy_intp A_col_stride = strides[4]; + npy_intp B_row_stride = strides[5]; + npy_intp B_col_stride = strides[6]; + npy_intp C_row_stride = strides[7]; + npy_intp C_col_stride = strides[8]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + if (descr->backend != BACKEND_SLEEF) { + PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul"); + return -1; + } + + MatmulOperationType op_type = determine_operation_type(m, n, p); + Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); + Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); + + char *A = data[0]; + char *B = data[1]; + char *C = data[2]; + + Sleef_quad *A_ptr = (Sleef_quad *)A; + Sleef_quad *B_ptr = (Sleef_quad *)B; + Sleef_quad *C_ptr = (Sleef_quad *)C; + + int result = -1; + + switch (op_type) { + case MATMUL_DOT: { + Sleef_quad *temp_A_buffer = new Sleef_quad[n]; + Sleef_quad *temp_B_buffer = new Sleef_quad[n]; + + memcpy(temp_A_buffer, A_ptr, n * sizeof(Sleef_quad)); + memcpy(temp_B_buffer, B_ptr, n * sizeof(Sleef_quad)); + + size_t incx = 1; + size_t incy = 1; + + result = qblas_dot(n, temp_A_buffer, incx, temp_B_buffer, incy, C_ptr); + + delete[] temp_A_buffer; + delete[] temp_B_buffer; + break; + } + + case MATMUL_GEMV: { + size_t lda = A_row_stride / sizeof(Sleef_quad); + size_t incx = B_row_stride / sizeof(Sleef_quad); + size_t incy = C_row_stride / sizeof(Sleef_quad); + Sleef_quad *temp_A_buffer = new Sleef_quad[m * n]; - if (!temp_A_buffer) { - PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM"); - delete[] temp_A_buffer; - return -1; - } Sleef_quad *temp_B_buffer = new Sleef_quad[n * p]; - if (!temp_B_buffer) { - PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM"); - delete[] temp_A_buffer; - return -1; - } memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad)); memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad)); A_ptr = temp_A_buffer; B_ptr = temp_B_buffer; + // Use temp_C_buffer to avoid unaligned writes Sleef_quad *temp_C_buffer = new Sleef_quad[m * p]; - if (!temp_C_buffer) { - PyErr_SetString(PyExc_MemoryError, - "Failed to allocate temporary buffer for GEMM result"); - return -1; - } + lda = n; + incx = 1; + incy = 1; + + memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad)); + + result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, + temp_C_buffer, incy); + break; + } + + case MATMUL_GEMM: { + size_t lda = A_row_stride / sizeof(Sleef_quad); + size_t ldb = B_row_stride / sizeof(Sleef_quad); + size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad); + + Sleef_quad *temp_A_buffer = new Sleef_quad[m * n]; + Sleef_quad *temp_B_buffer = new Sleef_quad[n * p]; + memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad)); + memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad)); + A_ptr = temp_A_buffer; + B_ptr = temp_B_buffer; + + // since these are now contiguous so, + lda = n; + ldb = p; size_t ldc_temp = p; + Sleef_quad *temp_C_buffer = new Sleef_quad[m * p]; + memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad)); + result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta, temp_C_buffer, ldc_temp); @@ -218,8 +321,8 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp p = dimensions[3]; npy_intp A_batch_stride = strides[0]; - npy_intp B_batch_stride = strides[1]; - npy_intp C_batch_stride = strides[2]; + npy_intp B_stride = strides[1]; + npy_intp C_stride = strides[2]; npy_intp A_row_stride = strides[3]; npy_intp A_col_stride = strides[4]; @@ -232,46 +335,44 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], QuadBackendType backend = descr->backend; size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); - for (npy_intp batch = 0; batch < N; batch++) { - char *A_batch = data[0] + batch * A_batch_stride; - char *B_batch = data[1] + batch * B_batch_stride; - char *C_batch = data[2] + batch * C_batch_stride; - - for (npy_intp i = 0; i < m; i++) { - for (npy_intp j = 0; j < p; j++) { - char *C_ij = C_batch + i * C_row_stride + j * C_col_stride; + char *A = data[0]; + char *B = data[1]; + char *C = data[2]; - if (backend == BACKEND_SLEEF) { - Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); + for (npy_intp i = 0; i < m; i++) { + for (npy_intp j = 0; j < p; j++) { + char *C_ij = C + i * C_row_stride + j * C_col_stride; - for (npy_intp k = 0; k < n; k++) { - char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; - char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + if (backend == BACKEND_SLEEF) { + Sleef_quad sum = Sleef_cast_from_doubleq1(0.0); - Sleef_quad a_val, b_val; - memcpy(&a_val, A_ik, sizeof(Sleef_quad)); - memcpy(&b_val, B_kj, sizeof(Sleef_quad)); - sum = Sleef_fmaq1_u05(a_val, b_val, sum); - } + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A + i * A_row_stride + k * A_col_stride; + char *B_kj = B + k * B_row_stride + j * B_col_stride; - memcpy(C_ij, &sum, sizeof(Sleef_quad)); + Sleef_quad a_val, b_val; + memcpy(&a_val, A_ik, sizeof(Sleef_quad)); + memcpy(&b_val, B_kj, sizeof(Sleef_quad)); + sum = Sleef_fmaq1_u05(a_val, b_val, sum); } - else { - long double sum = 0.0L; - for (npy_intp k = 0; k < n; k++) { - char *A_ik = A_batch + i * A_row_stride + k * A_col_stride; - char *B_kj = B_batch + k * B_row_stride + j * B_col_stride; + memcpy(C_ij, &sum, sizeof(Sleef_quad)); + } + else { + long double sum = 0.0L; - long double a_val, b_val; - memcpy(&a_val, A_ik, sizeof(long double)); - memcpy(&b_val, B_kj, sizeof(long double)); + for (npy_intp k = 0; k < n; k++) { + char *A_ik = A + i * A_row_stride + k * A_col_stride; + char *B_kj = B + k * B_row_stride + j * B_col_stride; - sum += a_val * b_val; - } + long double a_val, b_val; + memcpy(&a_val, A_ik, sizeof(long double)); + memcpy(&b_val, B_kj, sizeof(long double)); - memcpy(C_ij, &sum, sizeof(long double)); + sum += a_val * b_val; } + + memcpy(C_ij, &sum, sizeof(long double)); } } } @@ -289,21 +390,22 @@ init_matmul_ops(PyObject *numpy) PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; - #ifndef DISABLE_QUADBLAS +#ifndef DISABLE_QUADBLAS // set threading to max int num_threads = _quadblas_get_num_threads(); _quadblas_set_num_threads(num_threads); - PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop}, - {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, - {0, NULL}}; - #else + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, + {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop_unaligned}, + {0, NULL}}; +#else PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors}, {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop}, {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop}, {0, NULL}}; - #endif // DISABLE_QUADBLAS +#endif // DISABLE_QUADBLAS PyArrayMethod_Spec Spec = { .name = "quad_matmul_qblas", @@ -335,7 +437,7 @@ init_matmul_ops(PyObject *numpy) } if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { - PyErr_Clear(); // Don't fail if promoter fails + PyErr_Clear(); } else { } From 1fe6c8172582ec2d24d34640afef72f53e2c078d Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sun, 20 Jul 2025 04:14:10 +0530 Subject: [PATCH 19/20] wrapping up --- quaddtype/README.md | 8 +++++-- quaddtype/numpy_quaddtype/src/umath/matmul.h | 23 -------------------- quaddtype/release_tracker.md | 2 -- 3 files changed, 6 insertions(+), 27 deletions(-) diff --git a/quaddtype/README.md b/quaddtype/README.md index af4ddef7..11eef1d2 100644 --- a/quaddtype/README.md +++ b/quaddtype/README.md @@ -27,7 +27,7 @@ np.array([1,2,3], dtype=QuadPrecDType("longdouble")) The code needs the quad precision pieces of the sleef library, which is not available on most systems by default, so we have to generate -that first. The below assumes one has the required pieces to build +that first. The below assumes one has the required pieces to build sleef (cmake and libmpfr-dev), and that one is in the package directory locally. @@ -40,6 +40,7 @@ cd .. ``` Building the `numpy-quaddtype` package from locally installed sleef: + ```bash export SLEEF_DIR=$PWD/sleef/build export LIBRARY_PATH=$SLEEF_DIR/lib @@ -57,10 +58,13 @@ export LDFLAGS="-Wl,-rpath,$SLEEF_DIR/lib -fopenmp -latomic -lpthread" export CFLAGS="-fPIC" export CXXFLAGS="-fPIC" +# To build without QBLAS (default for MSVC) +# export CFLAGS="-fPIC -DDISABLE_QUADBLAS" +# export CXXFLAGS="-fPIC -DDISABLE_QUADBLAS" + python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' # Run the tests cd .. python -m pytest ``` - diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h index 947e2c3d..12858497 100644 --- a/quaddtype/numpy_quaddtype/src/umath/matmul.h +++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h @@ -1,35 +1,12 @@ #ifndef _QUADDTYPE_MATMUL_H #define _QUADDTYPE_MATMUL_H -/** - * Quad Precision Matrix Multiplication for NumPy - * - * This module implements matrix multiplication functionality for the QuadPrecDType - * by registering custom loops with numpy's matmul generalized ufunc. - * - * Supports all matmul operation types: - * - Vector-vector (dot product): (n,) @ (n,) -> scalar - * - Matrix-vector: (m,n) @ (n,) -> (m,) - * - Vector-matrix: (n,) @ (n,p) -> (p,) - * - Matrix-matrix: (m,n) @ (n,p) -> (m,p) - * - * Uses naive algorithms optimized for correctness rather than performance. - * For production use, consider integration with QBLAS optimized routines. - */ - #include #ifdef __cplusplus extern "C" { #endif -/** - * Initialize the matmul operations for the quad precision dtype. - * This function registers the matmul generalized ufunc with numpy. - * - * @param numpy The numpy module object - * @return 0 on success, -1 on failure - */ int init_matmul_ops(PyObject *numpy); diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md index 3ed10042..fbd3b205 100644 --- a/quaddtype/release_tracker.md +++ b/quaddtype/release_tracker.md @@ -91,5 +91,3 @@ | floor | ✅ | | ceil | ✅ | | trunc | ✅ | - -- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback From 8f16b9904b9360e6daa902cfd400816ce8a759ff Mon Sep 17 00:00:00 2001 From: swayaminsync Date: Sun, 20 Jul 2025 04:27:01 +0530 Subject: [PATCH 20/20] updated branch to main --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 0bf55c46..37a80387 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,7 +3,7 @@ name: Build Wheels on: push: branches: - - dot + - main tags: - "quaddtype-v*" paths: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e42c12c8..542093c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,7 @@ name: Numpy User DTypes CI on: push: branches: - - dot + - main pull_request: workflow_dispatch: