From 33b48fefb2ee2795e4b8b598a864480f5028cfdb Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:04:46 +0000
Subject: [PATCH 01/20] umath refactor

---
 quaddtype/meson.build                         |  13 +-
 .../numpy_quaddtype/src/quaddtype_main.c      |   2 +-
 quaddtype/numpy_quaddtype/src/umath.cpp       | 806 ------------------
 .../numpy_quaddtype/src/umath/binary_ops.cpp  | 235 +++++
 .../numpy_quaddtype/src/umath/binary_ops.h    |   9 +
 .../src/umath/comparison_ops.cpp              | 240 ++++++
 .../src/umath/comparison_ops.h                |   9 +
 .../numpy_quaddtype/src/umath/promoters.hpp   |  90 ++
 quaddtype/numpy_quaddtype/src/umath/umath.cpp | 115 +++
 .../numpy_quaddtype/src/{ => umath}/umath.h   |   2 +-
 .../numpy_quaddtype/src/umath/unary_ops.cpp   | 214 +++++
 .../numpy_quaddtype/src/umath/unary_ops.h     |   9 +
 12 files changed, 933 insertions(+), 811 deletions(-)
 delete mode 100644 quaddtype/numpy_quaddtype/src/umath.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.h
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/promoters.hpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/umath.cpp
 rename quaddtype/numpy_quaddtype/src/{ => umath}/umath.h (95%)
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.h
diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index d1c67990..66318d60 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -50,12 +50,19 @@ srcs = [
     'numpy_quaddtype/src/scalar_ops.h',
     'numpy_quaddtype/src/scalar_ops.cpp',
     'numpy_quaddtype/src/ops.hpp',
-    'numpy_quaddtype/src/umath.h',
-    'numpy_quaddtype/src/umath.cpp',
     'numpy_quaddtype/src/dragon4.h',
     'numpy_quaddtype/src/dragon4.c',
     'numpy_quaddtype/src/quadblas_interface.h',
-    'numpy_quaddtype/src/quadblas_interface.cpp'
+    'numpy_quaddtype/src/quadblas_interface.cpp',
+    'numpy_quaddtype/src/umath/umath.h',
+    'numpy_quaddtype/src/umath/umath.cpp',
+    'numpy_quaddtype/src/umath/binary_ops.h',
+    'numpy_quaddtype/src/umath/binary_ops.cpp',
+    'numpy_quaddtype/src/umath/unary_ops.h',
+    'numpy_quaddtype/src/umath/unary_ops.cpp',
+    'numpy_quaddtype/src/umath/comparison_ops.h',
+    'numpy_quaddtype/src/umath/comparison_ops.cpp',
+    'numpy_quaddtype/src/umath/promoters.hpp',
 ]
 
 py.install_sources(
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index 9e2b8430..641200d3 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -14,7 +14,7 @@
 
 #include "scalar.h"
 #include "dtype.h"
-#include "umath.h"
+#include "umath/umath.h"
 #include "quad_common.h"
 #include "quadblas_interface.h"
 #include "float.h"
diff --git a/quaddtype/numpy_quaddtype/src/umath.cpp b/quaddtype/numpy_quaddtype/src/umath.cpp
deleted file mode 100644
index 0058236a..00000000
--- a/quaddtype/numpy_quaddtype/src/umath.cpp
+++ /dev/null
@@ -1,806 +0,0 @@
-#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
-#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
-#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
-#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
-#define NO_IMPORT_ARRAY
-#define NO_IMPORT_UFUNC
-
-extern "C" {
-#include <Python.h>
-#include <cstdio>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ndarraytypes.h"
-#include "numpy/ufuncobject.h"
-
-#include "numpy/dtype_api.h"
-}
-#include "quad_common.h"
-#include "scalar.h"
-#include "dtype.h"
-#include "umath.h"
-#include "ops.hpp"
-
-// helper debugging function
-static const char *
-get_dtype_name(PyArray_DTypeMeta *dtype)
-{
-    if (dtype == &QuadPrecDType) {
-        return "QuadPrecDType";
-    }
-    else if (dtype == &PyArray_BoolDType) {
-        return "BoolDType";
-    }
-    else if (dtype == &PyArray_ByteDType) {
-        return "ByteDType";
-    }
-    else if (dtype == &PyArray_UByteDType) {
-        return "UByteDType";
-    }
-    else if (dtype == &PyArray_ShortDType) {
-        return "ShortDType";
-    }
-    else if (dtype == &PyArray_UShortDType) {
-        return "UShortDType";
-    }
-    else if (dtype == &PyArray_IntDType) {
-        return "IntDType";
-    }
-    else if (dtype == &PyArray_UIntDType) {
-        return "UIntDType";
-    }
-    else if (dtype == &PyArray_LongDType) {
-        return "LongDType";
-    }
-    else if (dtype == &PyArray_ULongDType) {
-        return "ULongDType";
-    }
-    else if (dtype == &PyArray_LongLongDType) {
-        return "LongLongDType";
-    }
-    else if (dtype == &PyArray_ULongLongDType) {
-        return "ULongLongDType";
-    }
-    else if (dtype == &PyArray_FloatDType) {
-        return "FloatDType";
-    }
-    else if (dtype == &PyArray_DoubleDType) {
-        return "DoubleDType";
-    }
-    else if (dtype == &PyArray_LongDoubleDType) {
-        return "LongDoubleDType";
-    }
-    else {
-        return "UnknownDType";
-    }
-}
-
-static NPY_CASTING
-quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                  PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
-                                  npy_intp *NPY_UNUSED(view_offset))
-{
-    Py_INCREF(given_descrs[0]);
-    loop_descrs[0] = given_descrs[0];
-
-    if (given_descrs[1] == NULL) {
-        Py_INCREF(given_descrs[0]);
-        loop_descrs[1] = given_descrs[0];
-    }
-    else {
-        Py_INCREF(given_descrs[1]);
-        loop_descrs[1] = given_descrs[1];
-    }
-
-    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1];
-
-    if (descr_in->backend != descr_out->backend) {
-        return NPY_UNSAFE_CASTING;
-    }
-
-    return NPY_NO_CASTING;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                             npy_intp const dimensions[], npy_intp const strides[],
-                                             NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in_ptr = data[0];
-    char *out_ptr = data[1];
-    npy_intp in_stride = strides[0];
-    npy_intp out_stride = strides[1];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in, out;
-    while (N--) {
-        memcpy(&in, in_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in.sleef_value);
-        }
-        else {
-            out.longdouble_value = longdouble_op(&in.longdouble_value);
-        }
-        memcpy(out_ptr, &out, elem_size);
-
-        in_ptr += in_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                           npy_intp const dimensions[], npy_intp const strides[],
-                                           NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in_ptr = data[0];
-    char *out_ptr = data[1];
-    npy_intp in_stride = strides[0];
-    npy_intp out_stride = strides[1];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in_ptr);
-        }
-        in_ptr += in_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_unary_op_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_unary_op_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_unary_op",
-            .nin = 1,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = NPY_METH_SUPPORTS_UNALIGNED,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    return 0;
-}
-
-int
-init_quad_unary_ops(PyObject *numpy)
-{
-    if (create_quad_unary_ufunc<quad_negative, ld_negative>(numpy, "negative") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_positive, ld_positive>(numpy, "positive") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_absolute, ld_absolute>(numpy, "absolute") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_rint, ld_rint>(numpy, "rint") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_trunc, ld_trunc>(numpy, "trunc") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_floor, ld_floor>(numpy, "floor") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_ceil, ld_ceil>(numpy, "ceil") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_sqrt, ld_sqrt>(numpy, "sqrt") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_square, ld_square>(numpy, "square") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log, ld_log>(numpy, "log") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log2, ld_log2>(numpy, "log2") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log10, ld_log10>(numpy, "log10") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log1p, ld_log1p>(numpy, "log1p") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_exp, ld_exp>(numpy, "exp") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_exp2, ld_exp2>(numpy, "exp2") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_sin, ld_sin>(numpy, "sin") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_cos, ld_cos>(numpy, "cos") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_tan, ld_tan>(numpy, "tan") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_asin, ld_asin>(numpy, "arcsin") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_acos, ld_acos>(numpy, "arccos") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_atan, ld_atan>(numpy, "arctan") < 0) {
-        return -1;
-    }
-    return 0;
-}
-
-// Binary ufuncs
-
-static NPY_CASTING
-quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                   PyArray_Descr *const given_descrs[],
-                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
-{
-    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
-
-    // Determine target backend and if casting is needed
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
-    }
-
-    // Set up input descriptors, casting if necessary
-    for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
-    }
-
-    // Set up output descriptor
-    if (given_descrs[2] == NULL) {
-        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-        if (!loop_descrs[2]) {
-            return (NPY_CASTING)-1;
-        }
-    }
-    else {
-        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
-        if (descr_out->backend != target_backend) {
-            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[2]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[2]);
-            loop_descrs[2] = given_descrs[2];
-        }
-    }
-    return casting;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                          npy_intp const dimensions[], npy_intp const strides[],
-                                          NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in1, in2, out;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
-        }
-        memcpy(out_ptr, &out, elem_size);
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                        npy_intp const dimensions[], npy_intp const strides[],
-                                        NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
-        }
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-static int
-quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                    PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    int nin = ufunc->nin;
-    int nargs = ufunc->nargs;
-    PyArray_DTypeMeta *common = NULL;
-    bool has_quad = false;
-
-    // Handle the special case for reductions
-    if (op_dtypes[0] == NULL) {
-        assert(nin == 2 && ufunc->nout == 1); /* must be reduction */
-        for (int i = 0; i < 3; i++) {
-            Py_INCREF(op_dtypes[1]);
-            new_op_dtypes[i] = op_dtypes[1];
-        }
-        return 0;
-    }
-
-    // Check if any input or signature is QuadPrecision
-    for (int i = 0; i < nin; i++) {
-        if (op_dtypes[i] == &QuadPrecDType) {
-            has_quad = true;
-        }
-    }
-
-    if (has_quad) {
-        common = &QuadPrecDType;
-    }
-    else {
-        for (int i = nin; i < nargs; i++) {
-            if (signature[i] != NULL) {
-                if (common == NULL) {
-                    Py_INCREF(signature[i]);
-                    common = signature[i];
-                }
-                else if (common != signature[i]) {
-                    Py_CLEAR(common);  // Not homogeneous, unset common
-                    break;
-                }
-            }
-        }
-    }
-    // If no common output dtype, use standard promotion for inputs
-    if (common == NULL) {
-        common = PyArray_PromoteDTypeSequence(nin, op_dtypes);
-        if (common == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
-                PyErr_Clear();  // Do not propagate normal promotion errors
-            }
-
-            return -1;
-        }
-    }
-
-    // Set all new_op_dtypes to the common dtype
-    for (int i = 0; i < nargs; i++) {
-        if (signature[i]) {
-            // If signature is specified for this argument, use it
-            Py_INCREF(signature[i]);
-            new_op_dtypes[i] = signature[i];
-        }
-        else {
-            // Otherwise, use the common dtype
-            Py_INCREF(common);
-
-            new_op_dtypes[i] = common;
-        }
-    }
-
-    Py_XDECREF(common);
-
-    return 0;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_binop_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_binop_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_binop",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-    return 0;
-}
-
-int
-init_quad_binary_ops(PyObject *numpy)
-{
-    if (create_quad_binary_ufunc<quad_add, ld_add>(numpy, "add") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_sub, ld_sub>(numpy, "subtract") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mul, ld_mul>(numpy, "multiply") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_div, ld_div>(numpy, "divide") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_pow, ld_pow>(numpy, "power") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mod, ld_mod>(numpy, "mod") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_minimum, ld_minimum>(numpy, "minimum") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_maximum, ld_maximum>(numpy, "maximum") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_atan2, ld_atan2>(numpy, "arctan2") < 0) {
-        return -1;
-    }
-    return 0;
-}
-
-// comparison functions
-
-static NPY_CASTING
-quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                       PyArray_Descr *const given_descrs[],
-                                       PyArray_Descr *loop_descrs[],
-                                       npy_intp *NPY_UNUSED(view_offset))
-{
-    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
-
-    // As dealing with different backends then cast to boolean
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
-    }
-
-    // Set up input descriptors, casting if necessary
-    for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
-    }
-
-    // Set up output descriptor
-    loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL);
-    if (!loop_descrs[2]) {
-        return (NPY_CASTING)-1;
-    }
-    return casting;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
-                               npy_intp const dimensions[], npy_intp const strides[],
-                               NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in1, in2;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        npy_bool result;
-
-        if (backend == BACKEND_SLEEF) {
-            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
-        }
-
-        memcpy(out_ptr, &result, sizeof(npy_bool));
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                       npy_intp const dimensions[], npy_intp const strides[],
-                                       NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    while (N--) {
-        quad_value in1 = *(quad_value *)in1_ptr;
-        quad_value in2 = *(quad_value *)in2_ptr;
-
-        npy_bool result;
-
-        if (backend == BACKEND_SLEEF) {
-            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
-        }
-
-        *(npy_bool *)out_ptr = result;
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-NPY_NO_EXPORT int
-comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                          PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    PyArray_DTypeMeta *new_signature[NPY_MAXARGS];
-    memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *));
-    new_signature[2] = NULL;
-    int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes);
-    if (res < 0) {
-        return -1;
-    }
-    Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType);
-    return 0;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_comp_strided_loop_aligned<sleef_comp, ld_comp>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_comp_strided_loop<sleef_comp, ld_comp>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_comp",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_SAFE_CASTING,
-            .flags = NPY_METH_SUPPORTS_UNALIGNED,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-
-    return 0;
-}
-
-int
-init_quad_comps(PyObject *numpy)
-{
-    if (create_quad_comparison_ufunc<quad_equal, ld_equal>(numpy, "equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_notequal, ld_notequal>(numpy, "not_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_less, ld_less>(numpy, "less") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_lessequal, ld_lessequal>(numpy, "less_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greater, ld_greater>(numpy, "greater") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greaterequal, ld_greaterequal>(numpy, "greater_equal") <
-        0) {
-        return -1;
-    }
-
-    return 0;
-}
-
-int
-init_quad_umath(void)
-{
-    PyObject *numpy = PyImport_ImportModule("numpy");
-    if (!numpy) {
-        PyErr_SetString(PyExc_ImportError, "Failed to import numpy module");
-        return -1;
-    }
-
-    if (init_quad_unary_ops(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations");
-        goto err;
-    }
-
-    if (init_quad_binary_ops(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations");
-        goto err;
-    }
-
-    if (init_quad_comps(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations");
-        goto err;
-    }
-
-    Py_DECREF(numpy);
-    return 0;
-
-err:
-    Py_DECREF(numpy);
-    return -1;
-}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
new file mode 100644
index 00000000..aa6d19ce
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
@@ -0,0 +1,235 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+#include "promoters.hpp"
+#include "binary_ops.h"
+
+static NPY_CASTING
+quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                   PyArray_Descr *const given_descrs[],
+                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+{
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
+
+    // Determine target backend and if casting is needed
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
+
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
+        }
+    }
+
+    // Set up output descriptor
+    if (given_descrs[2] == NULL) {
+        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+        if (!loop_descrs[2]) {
+            return (NPY_CASTING)-1;
+        }
+    }
+    else {
+        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
+        if (descr_out->backend != target_backend) {
+            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[2]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[2]);
+            loop_descrs[2] = given_descrs[2];
+        }
+    }
+    return casting;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                          npy_intp const dimensions[], npy_intp const strides[],
+                                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in1, in2, out;
+    while (N--) {
+        memcpy(&in1, in1_ptr, elem_size);
+        memcpy(&in2, in2_ptr, elem_size);
+        if (backend == BACKEND_SLEEF) {
+            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+        }
+        else {
+            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+        }
+        memcpy(out_ptr, &out, elem_size);
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                        npy_intp const dimensions[], npy_intp const strides[],
+                                        NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+
+    while (N--) {
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
+        }
+        else {
+            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+        }
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_binop_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_binop_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_binop",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == 0) {
+        Py_DECREF(promoter_capsule);
+        return -1;
+    }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(DTypes);
+        return -1;
+    }
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(DTypes);
+    return 0;
+}
+
+
+int
+init_quad_binary_ops(PyObject *numpy)
+{
+    if (create_quad_binary_ufunc<quad_add, ld_add>(numpy, "add") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_sub, ld_sub>(numpy, "subtract") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_mul, ld_mul>(numpy, "multiply") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_div, ld_div>(numpy, "divide") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_pow, ld_pow>(numpy, "power") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_mod, ld_mod>(numpy, "mod") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_minimum, ld_minimum>(numpy, "minimum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_maximum, ld_maximum>(numpy, "maximum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_atan2, ld_atan2>(numpy, "arctan2") < 0) {
+        return -1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.h b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h
new file mode 100644
index 00000000..c45ec453
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_BINARY_OPS_H
+#define _QUADDTYPE_BINARY_OPS_H
+
+#include <Python.h>
+
+int
+init_quad_binary_ops(PyObject *numpy);
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
new file mode 100644
index 00000000..095b6d39
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
@@ -0,0 +1,240 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "umath.h"
+#include "../ops.hpp"
+#include "promoters.hpp"
+#include "binary_ops.h"
+#include "comparison_ops.h"
+
+
+static NPY_CASTING
+quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                       PyArray_Descr *const given_descrs[],
+                                       PyArray_Descr *loop_descrs[],
+                                       npy_intp *NPY_UNUSED(view_offset))
+{
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
+
+    // As dealing with different backends then cast to boolean
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
+
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
+        }
+    }
+
+    // Set up output descriptor
+    loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL);
+    if (!loop_descrs[2]) {
+        return (NPY_CASTING)-1;
+    }
+    return casting;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                               npy_intp const dimensions[], npy_intp const strides[],
+                               NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in1, in2;
+    while (N--) {
+        memcpy(&in1, in1_ptr, elem_size);
+        memcpy(&in2, in2_ptr, elem_size);
+        npy_bool result;
+
+        if (backend == BACKEND_SLEEF) {
+            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
+        }
+        else {
+            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
+        }
+
+        memcpy(out_ptr, &result, sizeof(npy_bool));
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                       npy_intp const dimensions[], npy_intp const strides[],
+                                       NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    while (N--) {
+        quad_value in1 = *(quad_value *)in1_ptr;
+        quad_value in2 = *(quad_value *)in2_ptr;
+
+        npy_bool result;
+
+        if (backend == BACKEND_SLEEF) {
+            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
+        }
+        else {
+            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
+        }
+
+        *(npy_bool *)out_ptr = result;
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+NPY_NO_EXPORT int
+comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
+                          PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
+{
+    PyArray_DTypeMeta *new_signature[NPY_MAXARGS];
+    memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *));
+    new_signature[2] = NULL;
+    int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes);
+    if (res < 0) {
+        return -1;
+    }
+    Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType);
+    return 0;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_comp_strided_loop_aligned<sleef_comp, ld_comp>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_comp_strided_loop<sleef_comp, ld_comp>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_comp",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_SAFE_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType);
+    if (DTypes == 0) {
+        Py_DECREF(promoter_capsule);
+        return -1;
+    }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(DTypes);
+        return -1;
+    }
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(DTypes);
+
+    return 0;
+}
+
+int
+init_quad_comps(PyObject *numpy)
+{
+    if (create_quad_comparison_ufunc<quad_equal, ld_equal>(numpy, "equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_notequal, ld_notequal>(numpy, "not_equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_less, ld_less>(numpy, "less") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_lessequal, ld_lessequal>(numpy, "less_equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_greater, ld_greater>(numpy, "greater") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_greaterequal, ld_greaterequal>(numpy, "greater_equal") <
+        0) {
+        return -1;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
new file mode 100644
index 00000000..e3b8cc0e
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_COMPARISON_OPS_H
+#define _QUADDTYPE_COMPARISON_OPS_H
+
+#include <Python.h>
+
+int
+init_quad_comps(PyObject *numpy);
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/promoters.hpp b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp
new file mode 100644
index 00000000..3b3c1ef3
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp
@@ -0,0 +1,90 @@
+#ifndef _QUADDTYPE_PROMOTERS
+#define _QUADDTYPE_PROMOTERS
+
+#include <Python.h>
+#include <cstdio>
+#include <cassert>
+#include "numpy/arrayobject.h"
+#include "numpy/ndarrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+
+#include "../dtype.h"
+
+inline int
+quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
+                    PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
+{
+    int nin = ufunc->nin;
+    int nargs = ufunc->nargs;
+    PyArray_DTypeMeta *common = NULL;
+    bool has_quad = false;
+
+    // Handle the special case for reductions
+    if (op_dtypes[0] == NULL) {
+        assert(nin == 2 && ufunc->nout == 1); /* must be reduction */
+        for (int i = 0; i < 3; i++) {
+            Py_INCREF(op_dtypes[1]);
+            new_op_dtypes[i] = op_dtypes[1];
+        }
+        return 0;
+    }
+
+    // Check if any input or signature is QuadPrecision
+    for (int i = 0; i < nin; i++) {
+        if (op_dtypes[i] == &QuadPrecDType) {
+            has_quad = true;
+        }
+    }
+
+    if (has_quad) {
+        common = &QuadPrecDType;
+    }
+    else {
+        for (int i = nin; i < nargs; i++) {
+            if (signature[i] != NULL) {
+                if (common == NULL) {
+                    Py_INCREF(signature[i]);
+                    common = signature[i];
+                }
+                else if (common != signature[i]) {
+                    Py_CLEAR(common);  // Not homogeneous, unset common
+                    break;
+                }
+            }
+        }
+    }
+    // If no common output dtype, use standard promotion for inputs
+    if (common == NULL) {
+        common = PyArray_PromoteDTypeSequence(nin, op_dtypes);
+        if (common == NULL) {
+            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                PyErr_Clear();  // Do not propagate normal promotion errors
+            }
+
+            return -1;
+        }
+    }
+
+    // Set all new_op_dtypes to the common dtype
+    for (int i = 0; i < nargs; i++) {
+        if (signature[i]) {
+            // If signature is specified for this argument, use it
+            Py_INCREF(signature[i]);
+            new_op_dtypes[i] = signature[i];
+        }
+        else {
+            // Otherwise, use the common dtype
+            Py_INCREF(common);
+
+            new_op_dtypes[i] = common;
+        }
+    }
+
+    Py_XDECREF(common);
+
+    return 0;
+}
+
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
new file mode 100644
index 00000000..2ea864e7
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
@@ -0,0 +1,115 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+extern "C" {
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+}
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "umath.h"
+#include "../ops.hpp"
+#include "unary_ops.h"
+#include "binary_ops.h"
+#include "comparison_ops.h"
+
+// helper debugging function
+static const char *
+get_dtype_name(PyArray_DTypeMeta *dtype)
+{
+    if (dtype == &QuadPrecDType) {
+        return "QuadPrecDType";
+    }
+    else if (dtype == &PyArray_BoolDType) {
+        return "BoolDType";
+    }
+    else if (dtype == &PyArray_ByteDType) {
+        return "ByteDType";
+    }
+    else if (dtype == &PyArray_UByteDType) {
+        return "UByteDType";
+    }
+    else if (dtype == &PyArray_ShortDType) {
+        return "ShortDType";
+    }
+    else if (dtype == &PyArray_UShortDType) {
+        return "UShortDType";
+    }
+    else if (dtype == &PyArray_IntDType) {
+        return "IntDType";
+    }
+    else if (dtype == &PyArray_UIntDType) {
+        return "UIntDType";
+    }
+    else if (dtype == &PyArray_LongDType) {
+        return "LongDType";
+    }
+    else if (dtype == &PyArray_ULongDType) {
+        return "ULongDType";
+    }
+    else if (dtype == &PyArray_LongLongDType) {
+        return "LongLongDType";
+    }
+    else if (dtype == &PyArray_ULongLongDType) {
+        return "ULongLongDType";
+    }
+    else if (dtype == &PyArray_FloatDType) {
+        return "FloatDType";
+    }
+    else if (dtype == &PyArray_DoubleDType) {
+        return "DoubleDType";
+    }
+    else if (dtype == &PyArray_LongDoubleDType) {
+        return "LongDoubleDType";
+    }
+    else {
+        return "UnknownDType";
+    }
+}
+
+int
+init_quad_umath(void)
+{
+    PyObject *numpy = PyImport_ImportModule("numpy");
+    if (!numpy) {
+        PyErr_SetString(PyExc_ImportError, "Failed to import numpy module");
+        return -1;
+    }
+
+    if (init_quad_unary_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations");
+        goto err;
+    }
+
+    if (init_quad_binary_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations");
+        goto err;
+    }
+
+    if (init_quad_comps(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations");
+        goto err;
+    }
+
+    // if (init_quad_matmul(numpy) < 0) {
+    //     PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
+    //     goto err;
+    // }
+
+    Py_DECREF(numpy);
+    return 0;
+
+err:
+    Py_DECREF(numpy);
+    return -1;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath.h b/quaddtype/numpy_quaddtype/src/umath/umath.h
similarity index 95%
rename from quaddtype/numpy_quaddtype/src/umath.h
rename to quaddtype/numpy_quaddtype/src/umath/umath.h
index d64f26be..c9253ef7 100644
--- a/quaddtype/numpy_quaddtype/src/umath.h
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.h
@@ -12,4 +12,4 @@ init_quad_umath(void);
 }
 #endif
 
-#endif
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
new file mode 100644
index 00000000..4c8f31f0
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
@@ -0,0 +1,214 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+extern "C" {
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/ufuncobject.h"
+
+#include "numpy/dtype_api.h"
+}
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+
+static NPY_CASTING
+quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                  PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
+                                  npy_intp *NPY_UNUSED(view_offset))
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+
+    if (given_descrs[1] == NULL) {
+        Py_INCREF(given_descrs[0]);
+        loop_descrs[1] = given_descrs[0];
+    }
+    else {
+        Py_INCREF(given_descrs[1]);
+        loop_descrs[1] = given_descrs[1];
+    }
+
+    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1];
+
+    if (descr_in->backend != descr_out->backend) {
+        return NPY_UNSAFE_CASTING;
+    }
+
+    return NPY_NO_CASTING;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                             npy_intp const dimensions[], npy_intp const strides[],
+                                             NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in, out;
+    while (N--) {
+        memcpy(&in, in_ptr, elem_size);
+        if (backend == BACKEND_SLEEF) {
+            out.sleef_value = sleef_op(&in.sleef_value);
+        }
+        else {
+            out.longdouble_value = longdouble_op(&in.longdouble_value);
+        }
+        memcpy(out_ptr, &out, elem_size);
+
+        in_ptr += in_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                           npy_intp const dimensions[], npy_intp const strides[],
+                                           NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+
+    while (N--) {
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr);
+        }
+        else {
+            *(long double *)out_ptr = longdouble_op((long double *)in_ptr);
+        }
+        in_ptr += in_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_unary_op_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_unary_op_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_unary_op",
+            .nin = 1,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+int
+init_quad_unary_ops(PyObject *numpy)
+{
+    if (create_quad_unary_ufunc<quad_negative, ld_negative>(numpy, "negative") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_positive, ld_positive>(numpy, "positive") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_absolute, ld_absolute>(numpy, "absolute") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_rint, ld_rint>(numpy, "rint") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_trunc, ld_trunc>(numpy, "trunc") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_floor, ld_floor>(numpy, "floor") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_ceil, ld_ceil>(numpy, "ceil") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_sqrt, ld_sqrt>(numpy, "sqrt") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_square, ld_square>(numpy, "square") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log, ld_log>(numpy, "log") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log2, ld_log2>(numpy, "log2") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log10, ld_log10>(numpy, "log10") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log1p, ld_log1p>(numpy, "log1p") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_exp, ld_exp>(numpy, "exp") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_exp2, ld_exp2>(numpy, "exp2") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_sin, ld_sin>(numpy, "sin") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_cos, ld_cos>(numpy, "cos") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_tan, ld_tan>(numpy, "tan") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_asin, ld_asin>(numpy, "arcsin") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_acos, ld_acos>(numpy, "arccos") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_atan, ld_atan>(numpy, "arctan") < 0) {
+        return -1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.h b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h
new file mode 100644
index 00000000..6c3f17a4
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_UNARY_OPS_H
+#define _QUADDTYPE_UNARY_OPS_H
+
+#include <Python.h>
+
+int
+init_quad_unary_ops(PyObject *numpy);
+
+#endif

From a35abce6cdcfbcdf9b660c41b21c5ce9be9c8c15 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:13:56 +0000
Subject: [PATCH 02/20] updaing ci

---
 .github/workflows/build_wheels.yml |  2 +-
 .github/workflows/ci.yml           | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 37a80387..e16e2ff4 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - main
+      - matmul-ufunc
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9267d9f4..0b4f2199 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - main
+      - matmul-ufunc
   pull_request:
   workflow_dispatch:
 
@@ -61,15 +61,21 @@ jobs:
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
       - name: Install SLEEF
         run: |
+          yum update -y
+          yum install -y cmake gcc gcc-c++ make git pkgconfig
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
           cmake --build build/ --clean-first -j
-          sudo cmake --install build --prefix /usr
+          sudo cmake --install build --prefix /usr/local
       - name: Install quaddtype
         working-directory: quaddtype
         run: |
-          LDFLAGS="-Wl,-rpath,/usr/lib" python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
+            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
+            LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+            python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype
         run: |

From 851654472afaff41c5b4a56701c1ea8de644b522 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:16:54 +0000
Subject: [PATCH 03/20] switching to apt

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0b4f2199..7a1b7e9c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -61,8 +61,8 @@ jobs:
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
       - name: Install SLEEF
         run: |
-          yum update -y
-          yum install -y cmake gcc gcc-c++ make git pkgconfig
+          sudo apt-get update -y
+          sudo apt-get install -y cmake gcc g++ make git pkg-config
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON

From 335f42534d2ec92130cb03a5d5dfa1916eab5045 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:22:01 +0000
Subject: [PATCH 04/20] submodule fix

---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7a1b7e9c..d2aa43fb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -75,6 +75,9 @@ jobs:
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
             LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+            
+            git submodule update --init --recursive
+            ls -la quaddtype/numpy_quaddtype/QBLAS/
             python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype

From 85e7840aab3737622203d0b62100df69d05b175e Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:26:37 +0000
Subject: [PATCH 05/20] submodule fix

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d2aa43fb..ff64875f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,7 +77,7 @@ jobs:
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
             
             git submodule update --init --recursive
-            ls -la quaddtype/numpy_quaddtype/QBLAS/
+            ls -la numpy_quaddtype/QBLAS/
             python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype

From e467f4b01c09c45d5bd0f3cefc80cc64c45b4772 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:32:42 +0000
Subject: [PATCH 06/20] submodule fix

---
 .github/workflows/ci.yml | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ff64875f..a70e8c10 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,6 +59,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
+
       - name: Install SLEEF
         run: |
           sudo apt-get update -y
@@ -68,17 +69,27 @@ jobs:
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
           cmake --build build/ --clean-first -j
           sudo cmake --install build --prefix /usr/local
+
       - name: Install quaddtype
         working-directory: quaddtype
         run: |
-            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
-            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
-            LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
-            
-            git submodule update --init --recursive
-            ls -la numpy_quaddtype/QBLAS/
-            python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
+          # Initialize submodules first
+          git submodule update --init --recursive
+          ls -la numpy_quaddtype/QBLAS/
+          
+          # Set environment variables with proper export and correct paths
+          export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include"
+          export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals"
+          export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp"
+          export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+          
+          # Install with meson args to ensure the C++ flags are passed through
+          python -m pip install . -v --no-build-isolation \
+            -Cbuilddir=build \
+            -C'compile-args=-v' \
+            -Csetup-args="-Dbuildtype=debug" \
+            -Csetup-args="-Dcpp_args=-fext-numeric-literals"
+
       - name: Run quaddtype tests
         working-directory: quaddtype
         run: |

From e201b90f7d5f299ab41cf04d92d9045d5da9ae21 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 07:16:29 +0000
Subject: [PATCH 07/20] initial matmul ufunc setup

---
 quaddtype/meson.build                         |   2 +
 .../src/quadblas_interface.cpp                |  91 +++++++++++
 .../numpy_quaddtype/src/umath/matmul.cpp      | 148 ++++++++++++++++++
 quaddtype/numpy_quaddtype/src/umath/matmul.h  |   8 +
 quaddtype/numpy_quaddtype/src/umath/umath.cpp |   9 +-
 5 files changed, 254 insertions(+), 4 deletions(-)
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.h

diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index 66318d60..c000675a 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -63,6 +63,8 @@ srcs = [
     'numpy_quaddtype/src/umath/comparison_ops.h',
     'numpy_quaddtype/src/umath/comparison_ops.cpp',
     'numpy_quaddtype/src/umath/promoters.hpp',
+    'numpy_quaddtype/src/umath/matmul.h',
+    'numpy_quaddtype/src/umath/matmul.cpp',
 ]
 
 py.install_sources(
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6bc3fb0d..cce39c39 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,4 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
+void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
+{
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+    QuadBLAS::Layout layout_a, layout_b;
+
+    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+        return nullptr;
+    }
+
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+
+    if (!sleef_a || !sleef_b) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+
+    npy_intp result_dims[2] = {m, n};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
+    if (!result) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        Py_DECREF(result_dtype);
+        return nullptr;
+    }
+
+    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+    for (npy_intp i = 0; i < m * n; i++) {
+        result_data[i] = Sleef_cast_from_doubleq1(0.0);
+    }
+
+    npy_intp lda, ldb, ldc;
+
+    if (layout_a == QuadBLAS::Layout::RowMajor) {
+        lda = k;
+    }
+    else {
+        lda = m;
+    }
+
+    if (layout_b == QuadBLAS::Layout::RowMajor) {
+        ldb = n;
+    }
+    else {
+        ldb = k;
+    }
+
+    QuadBLAS::Layout result_layout = layout_a;
+    if (result_layout == QuadBLAS::Layout::RowMajor) {
+        ldc = n;
+    }
+    else {
+        ldc = m;
+    }
+
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
+                   ldc);
+
+    if (result_backend == BACKEND_LONGDOUBLE) {
+        long double *ld_result = (long double *)PyArray_DATA(result);
+        for (npy_intp i = 0; i < m * n; i++) {
+            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        }
+    }
+
+    QuadBLAS::aligned_free(temp_a);
+    QuadBLAS::aligned_free(temp_b);
+
+    return (PyObject *)result;
+}
+
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
new file mode 100644
index 00000000..47fdf5f5
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -0,0 +1,148 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+#include "binary_ops.h"
+#include "matmul.h"
+
+#include <iostream>
+
+static NPY_CASTING
+quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                   PyArray_Descr *const given_descrs[],
+                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+{
+
+  NPY_CASTING casting = NPY_NO_CASTING;    
+  std::cout << "exiting the descriptor";
+  return casting;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                          npy_intp const dimensions[], npy_intp const strides[],
+                                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in1, in2, out;
+    while (N--) {
+        memcpy(&in1, in1_ptr, elem_size);
+        memcpy(&in2, in2_ptr, elem_size);
+        if (backend == BACKEND_SLEEF) {
+            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+        }
+        else {
+            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+        }
+        memcpy(out_ptr, &out, elem_size);
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                        npy_intp const dimensions[], npy_intp const strides[],
+                                        NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+
+    while (N--) {
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
+        }
+        else {
+            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+        }
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+int
+create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_matmul_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_matmul_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_matmul",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+    // my guess we don't need any promoter here as of now, since matmul is quad specific
+    return 0;
+}
+
+
+int
+init_matmul_ops(PyObject *numpy)
+{
+    if (create_matmul_ufunc<quad_add>(numpy, "matmul") < 0) {
+        return -1;
+    }
+    return 0;
+}
+
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
new file mode 100644
index 00000000..bc099eba
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -0,0 +1,8 @@
+#ifndef _QUADDTYPE_MATMUL_OPS_H
+#define _QUADDTYPE_MATMUL_OPS_H
+
+#include <Python.h>
+
+int
+init_matmul_ops(PyObject *numpy);
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
index 2ea864e7..50f95623 100644
--- a/quaddtype/numpy_quaddtype/src/umath/umath.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
@@ -22,6 +22,7 @@ extern "C" {
 #include "unary_ops.h"
 #include "binary_ops.h"
 #include "comparison_ops.h"
+#include "matmul.h"
 
 // helper debugging function
 static const char *
@@ -101,10 +102,10 @@ init_quad_umath(void)
         goto err;
     }
 
-    // if (init_quad_matmul(numpy) < 0) {
-    //     PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
-    //     goto err;
-    // }
+    if (init_matmul_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
+        goto err;
+    }
 
     Py_DECREF(numpy);
     return 0;

From 09918a310859e7879b3d2e2120e87a86e0ca0605 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 15:50:04 +0530
Subject: [PATCH 08/20] mid-way test

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../src/quadblas_interface.cpp                | 180 +++++++++---------
 .../numpy_quaddtype/src/umath/matmul.cpp      |  25 ++-
 quaddtype/release_tracker.md                  |  93 +++++++++
 4 files changed, 195 insertions(+), 105 deletions(-)
 create mode 100644 quaddtype/release_tracker.md

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 0eabb677..4853ac1c 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
+Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index cce39c39..46f953de 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,95 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
-void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
-{
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    npy_intp lda, ldb, ldc;
-
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = k;
-    }
-    else {
-        lda = m;
-    }
-
-    if (layout_b == QuadBLAS::Layout::RowMajor) {
-        ldb = n;
-    }
-    else {
-        ldb = k;
-    }
-
-    QuadBLAS::Layout result_layout = layout_a;
-    if (result_layout == QuadBLAS::Layout::RowMajor) {
-        ldc = n;
-    }
-    else {
-        ldc = m;
-    }
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-                   ldc);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-        }
-    }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
-}
+// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
+// {
+//     Sleef_quad *data_a, *data_b;
+//     QuadBackendType backend_a, backend_b;
+//     QuadBLAS::Layout layout_a, layout_b;
+
+//     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+//         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+//         return nullptr;
+//     }
+
+//     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+//     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+//     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+
+//     if (!sleef_a || !sleef_b) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         return nullptr;
+//     }
+
+//     QuadBackendType result_backend = BACKEND_SLEEF;
+//     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+//         result_backend = BACKEND_LONGDOUBLE;
+//     }
+
+//     npy_intp result_dims[2] = {m, n};
+//     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+//     if (!result_dtype) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         return nullptr;
+//     }
+
+//     PyArrayObject *result =
+//             (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
+//     if (!result) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         Py_DECREF(result_dtype);
+//         return nullptr;
+//     }
+
+//     Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+//     for (npy_intp i = 0; i < m * n; i++) {
+//         result_data[i] = Sleef_cast_from_doubleq1(0.0);
+//     }
+
+//     npy_intp lda, ldb, ldc;
+
+//     if (layout_a == QuadBLAS::Layout::RowMajor) {
+//         lda = k;
+//     }
+//     else {
+//         lda = m;
+//     }
+
+//     if (layout_b == QuadBLAS::Layout::RowMajor) {
+//         ldb = n;
+//     }
+//     else {
+//         ldb = k;
+//     }
+
+//     QuadBLAS::Layout result_layout = layout_a;
+//     if (result_layout == QuadBLAS::Layout::RowMajor) {
+//         ldc = n;
+//     }
+//     else {
+//         ldc = m;
+//     }
+
+//     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+//     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+//     QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
+//                    ldc);
+
+//     if (result_backend == BACKEND_LONGDOUBLE) {
+//         long double *ld_result = (long double *)PyArray_DATA(result);
+//         for (npy_intp i = 0; i < m * n; i++) {
+//             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+//         }
+//     }
+
+//     QuadBLAS::aligned_free(temp_a);
+//     QuadBLAS::aligned_free(temp_b);
+
+//     return (PyObject *)result;
+// }
 
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 47fdf5f5..01b8935d 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -5,7 +5,6 @@
 #define NO_IMPORT_ARRAY
 #define NO_IMPORT_UFUNC
 
-
 #include <Python.h>
 #include <cstdio>
 
@@ -25,20 +24,19 @@
 
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                   PyArray_Descr *const given_descrs[],
-                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+                                PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
+                                npy_intp *NPY_UNUSED(view_offset))
 {
-
-  NPY_CASTING casting = NPY_NO_CASTING;    
-  std::cout << "exiting the descriptor";
-  return casting;
+    NPY_CASTING casting = NPY_NO_CASTING;
+    std::cout << "exiting the descriptor";
+    return casting;
 }
 
 template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                          npy_intp const dimensions[], npy_intp const strides[],
-                                          NpyAuxData *auxdata)
+                                           npy_intp const dimensions[], npy_intp const strides[],
+                                           NpyAuxData *auxdata)
 {
     npy_intp N = dimensions[0];
     char *in1_ptr = data[0], *in2_ptr = data[1];
@@ -73,8 +71,8 @@ quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char
 template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                        npy_intp const dimensions[], npy_intp const strides[],
-                                        NpyAuxData *auxdata)
+                                         npy_intp const dimensions[], npy_intp const strides[],
+                                         NpyAuxData *auxdata)
 {
     npy_intp N = dimensions[0];
     char *in1_ptr = data[0], *in2_ptr = data[1];
@@ -101,6 +99,7 @@ quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *c
     return 0;
 }
 
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
 {
@@ -136,13 +135,11 @@ create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
     return 0;
 }
 
-
 int
 init_matmul_ops(PyObject *numpy)
 {
-    if (create_matmul_ufunc<quad_add>(numpy, "matmul") < 0) {
+    if (create_matmul_ufunc<quad_add, ld_add>(numpy, "matmul") < 0) {
         return -1;
     }
     return 0;
 }
-
diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
new file mode 100644
index 00000000..1ecf7d3a
--- /dev/null
+++ b/quaddtype/release_tracker.md
@@ -0,0 +1,93 @@
+# Plan for `numpy-quaddtype` v1.5
+
+| ufunc name    | Added |
+| ------------- | ----- |
+| add           | ✅    |
+| subtract      | ✅    |
+| multiply      | ✅    |
+| matmul        | #116  |
+| divide        | ✅    |
+| logaddexp     |       |
+| logaddexp2    |       |
+| true_divide   |       |
+| floor_divide  |       |
+| negative      | ✅    |
+| positive      | ✅    |
+| power         | ✅    |
+| float_power   |       |
+| remainder     |       |
+| mod           | ✅    |
+| fmod          |       |
+| divmod        |       |
+| absolute      | ✅    |
+| fabs          |       |
+| rint          | ✅    |
+| sign          |       |
+| heaviside     |       |
+| conj          |       |
+| conjugate     |       |
+| exp           | ✅    |
+| exp2          | ✅    |
+| log           | ✅    |
+| log2          | ✅    |
+| log10         | ✅    |
+| expm1         |       |
+| log1p         | ✅    |
+| sqrt          | ✅    |
+| square        | ✅    |
+| cbrt          |       |
+| reciprocal    |       |
+| gcd           |       |
+| lcm           |       |
+| sin           | ✅    |
+| cos           | ✅    |
+| tan           | ✅    |
+| arcsin        | ✅    |
+| arccos        | ✅    |
+| arctan        | ✅    |
+| arctan2       | ✅    |
+| hypot         |       |
+| sinh          |       |
+| cosh          |       |
+| tanh          |       |
+| arcsinh       |       |
+| arccosh       |       |
+| arctanh       |       |
+| degrees       |       |
+| radians       |       |
+| deg2rad       |       |
+| rad2deg       |       |
+| bitwise_and   |       |
+| bitwise_or    |       |
+| bitwise_xor   |       |
+| invert        |       |
+| left_shift    |       |
+| right_shift   |       |
+| greater       | ✅    |
+| greater_equal | ✅    |
+| less          | ✅    |
+| less_equal    | ✅    |
+| not_equal     | ✅    |
+| equal         | ✅    |
+| logical_and   |       |
+| logical_or    |       |
+| logical_xor   |       |
+| logical_not   |       |
+| maximum       | ✅    |
+| minimum       | ✅    |
+| fmax          |       |
+| fmin          |       |
+| isfinite      |       |
+| isinf         |       |
+| isnan         |       |
+| isnat         |       |
+| signbit       |       |
+| copysign      |       |
+| nextafter     |       |
+| spacing       |       |
+| modf          |       |
+| ldexp         |       |
+| frexp         |       |
+| floor         | ✅    |
+| ceil          | ✅    |
+| trunc         | ✅    |

From 70ca6446bf8146759a0ca21710f0a908ff3e1eb9 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 19:47:32 +0530
Subject: [PATCH 09/20] shifting to matmul ufunc

---
 quaddtype/numpy_quaddtype/__init__.py         |  3 +-
 .../numpy_quaddtype/src/quaddtype_main.c      | 69 ++++++++++---------
 quaddtype/tests/test_dot.py                   | 67 ++++++++----------
 3 files changed, 66 insertions(+), 73 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index b0a9f3be..8b588c11 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -3,7 +3,6 @@
     QuadPrecDType,
     is_longdouble_128,
     get_sleef_constant,
-    qblas_dot as dot,
     set_num_threads,
     get_num_threads,
     get_quadblas_version
@@ -17,7 +16,7 @@
     # Constants
     'pi', 'e', 'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon', 
     # QuadBLAS related functions
-    'dot', 'set_num_threads', 'get_num_threads', 'get_quadblas_version'
+    'set_num_threads', 'get_num_threads', 'get_quadblas_version'
 ]
 
 def SleefQuadPrecision(value):
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index 641200d3..1e8fd535 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -19,45 +19,55 @@
 #include "quadblas_interface.h"
 #include "float.h"
 
-
-static PyObject* py_is_longdouble_128(PyObject* self, PyObject* args) {
-    if(sizeof(long double) == 16 && 
-        LDBL_MANT_DIG == 113 && 
-        LDBL_MAX_EXP == 16384) {
+static PyObject *
+py_is_longdouble_128(PyObject *self, PyObject *args)
+{
+    if (sizeof(long double) == 16 && LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384) {
         Py_RETURN_TRUE;
-    } else {
+    }
+    else {
         Py_RETURN_FALSE;
     }
 }
 
-static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
-    const char* constant_name;
+static PyObject *
+get_sleef_constant(PyObject *self, PyObject *args)
+{
+    const char *constant_name;
     if (!PyArg_ParseTuple(args, "s", &constant_name)) {
         return NULL;
     }
 
-    QuadPrecisionObject* result = QuadPrecision_raw_new(BACKEND_SLEEF);
+    QuadPrecisionObject *result = QuadPrecision_raw_new(BACKEND_SLEEF);
     if (result == NULL) {
         return NULL;
     }
 
     if (strcmp(constant_name, "pi") == 0) {
         result->value.sleef_value = SLEEF_M_PIq;
-    } else if (strcmp(constant_name, "e") == 0) {
+    }
+    else if (strcmp(constant_name, "e") == 0) {
         result->value.sleef_value = SLEEF_M_Eq;
-    } else if (strcmp(constant_name, "log2e") == 0) {
+    }
+    else if (strcmp(constant_name, "log2e") == 0) {
         result->value.sleef_value = SLEEF_M_LOG2Eq;
-    } else if (strcmp(constant_name, "log10e") == 0) {
+    }
+    else if (strcmp(constant_name, "log10e") == 0) {
         result->value.sleef_value = SLEEF_M_LOG10Eq;
-    } else if (strcmp(constant_name, "ln2") == 0) {
+    }
+    else if (strcmp(constant_name, "ln2") == 0) {
         result->value.sleef_value = SLEEF_M_LN2q;
-    } else if (strcmp(constant_name, "ln10") == 0) {
+    }
+    else if (strcmp(constant_name, "ln10") == 0) {
         result->value.sleef_value = SLEEF_M_LN10q;
-    } else if (strcmp(constant_name, "quad_max") == 0) {
+    }
+    else if (strcmp(constant_name, "quad_max") == 0) {
         result->value.sleef_value = SLEEF_QUAD_MAX;
-    } else if (strcmp(constant_name, "quad_min") == 0) {
+    }
+    else if (strcmp(constant_name, "quad_min") == 0) {
         result->value.sleef_value = SLEEF_QUAD_MIN;
-    } else if (strcmp(constant_name, "epsilon") == 0) {
+    }
+    else if (strcmp(constant_name, "epsilon") == 0) {
         result->value.sleef_value = SLEEF_QUAD_EPSILON;
     }
     else {
@@ -66,26 +76,23 @@ static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
         return NULL;
     }
 
-    return (PyObject*)result;
+    return (PyObject *)result;
 }
 
 static PyMethodDef module_methods[] = {
-    {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
-    {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
-    {"qblas_dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"},
-    {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, "Set number of threads for QuadBLAS"},
-    {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, "Get number of threads for QuadBLAS"},
-    {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},
-    {NULL, NULL, 0, NULL} 
-};
+        {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
+        {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
+        {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS,
+         "Set number of threads for QuadBLAS"},
+        {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS,
+         "Get number of threads for QuadBLAS"},
+        {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},
+        {NULL, NULL, 0, NULL}};
 
 static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        .m_name = "_quaddtype_main",
+        PyModuleDef_HEAD_INIT, .m_name = "_quaddtype_main",
         .m_doc = "Quad (128-bit) floating point Data Type for NumPy with multiple backends",
-        .m_size = -1,
-        .m_methods = module_methods
-};
+        .m_size = -1, .m_methods = module_methods};
 
 PyMODINIT_FUNC
 PyInit__quaddtype_main(void)
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index ed135f47..f3fa3f62 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -1,19 +1,6 @@
-"""
-Focused test suite for the dot function in numpy_quaddtype
-
-This module tests the QuadBLAS dot function for:
-- Vector-vector dot products
-- Matrix-vector multiplication  
-- Matrix-matrix multiplication
-- Small and large matrix operations
-- Basic correctness validation
-
-Uses only the Sleef backend for simplicity.
-"""
-
 import pytest
 import numpy as np
-from numpy_quaddtype import QuadPrecision, QuadPrecDType, dot
+from numpy_quaddtype import QuadPrecision, QuadPrecDType
 
 
 # ================================================================================
@@ -81,14 +68,14 @@ def create_quad_array(values, shape=None):
 # ================================================================================
 
 class TestVectorVectorDot:
-    """Test vector-vector dot products"""
+    """Test vector-vector np.matmul products"""
     
     def test_simple_dot_product(self):
-        """Test basic vector dot product"""
+        """Test basic vector np.matmul product"""
         x = create_quad_array([1, 2, 3])
         y = create_quad_array([4, 5, 6])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = 1*4 + 2*5 + 3*6  # = 32
         
         assert isinstance(result, QuadPrecision)
@@ -99,14 +86,14 @@ def test_orthogonal_vectors(self):
         x = create_quad_array([1, 0, 0])
         y = create_quad_array([0, 1, 0])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         assert_quad_equal(result, 0.0)
     
     def test_same_vector(self):
-        """Test dot product of vector with itself"""
+        """Test np.matmul product of vector with itself"""
         x = create_quad_array([2, 3, 4])
         
-        result = dot(x, x)
+        result = np.matmul(x, x)
         expected = 2*2 + 3*3 + 4*4  # = 29
         
         assert_quad_equal(result, expected)
@@ -121,7 +108,7 @@ def test_various_vector_sizes(self, size):
         x = create_quad_array(x_vals)
         y = create_quad_array(y_vals)
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = sum(x_vals[i] * y_vals[i] for i in range(size))
         
         assert_quad_equal(result, expected)
@@ -131,7 +118,7 @@ def test_negative_and_fractional_values(self):
         x = create_quad_array([1.5, -2.5, 3.25])
         y = create_quad_array([-1.25, 2.75, -3.5])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = 1.5*(-1.25) + (-2.5)*2.75 + 3.25*(-3.5)
         
         assert_quad_equal(result, expected)
@@ -151,7 +138,7 @@ def test_simple_matrix_vector(self):
         # 3x1 vector  
         x = create_quad_array([1, 1, 1])
         
-        result = dot(A, x)
+        result = np.matmul(A, x)
         expected = [1+2+3, 4+5+6]  # [6, 15]
         
         assert result.shape == (2,)
@@ -164,7 +151,7 @@ def test_identity_matrix_vector(self):
         I = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3))
         x = create_quad_array([2, 3, 4])
         
-        result = dot(I, x)
+        result = np.matmul(I, x)
         
         assert result.shape == (3,)
         for i in range(3):
@@ -181,7 +168,7 @@ def test_various_matrix_vector_sizes(self, m, n):
         x_vals = [i + 1 for i in range(n)]
         x = create_quad_array(x_vals)
         
-        result = dot(A, x)
+        result = np.matmul(A, x)
         
         assert result.shape == (m,)
         
@@ -205,7 +192,7 @@ def test_simple_matrix_matrix(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         B = create_quad_array([5, 6, 7, 8], shape=(2, 2))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         # Expected: [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]] = [[19, 22], [43, 50]]
         expected = [[19, 22], [43, 50]]
@@ -221,11 +208,11 @@ def test_identity_matrix_multiplication(self):
         I = create_quad_array([1, 0, 0, 1], shape=(2, 2))
         
         # A * I should equal A
-        result1 = dot(A, I)
+        result1 = np.matmul(A, I)
         assert_quad_array_equal(result1, A)
         
         # I * A should equal A  
-        result2 = dot(I, A)
+        result2 = np.matmul(I, A)
         assert_quad_array_equal(result2, A)
     
     @pytest.mark.parametrize("m,n,k", [(2,2,2), (2,3,4), (3,2,5), (4,4,4), (5,6,7)])
@@ -239,7 +226,7 @@ def test_various_matrix_sizes(self, m, n, k):
         B_vals = [(i*n + j + 1) for i in range(k) for j in range(n)]
         B = create_quad_array(B_vals, shape=(k, n))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (m, n)
         
@@ -258,12 +245,12 @@ def test_associativity(self):
         C = create_quad_array([1, 1, 2, 1], shape=(2, 2))
         
         # Compute (A*B)*C
-        AB = dot(A, B)
-        result1 = dot(AB, C)
+        AB = np.matmul(A, B)
+        result1 = np.matmul(AB, C)
         
         # Compute A*(B*C)
-        BC = dot(B, C)
-        result2 = dot(A, BC)
+        BC = np.matmul(B, C)
+        result2 = np.matmul(A, BC)
         
         assert_quad_array_equal(result1, result2, rtol=1e-25)
 
@@ -285,7 +272,7 @@ def test_large_square_matrices(self, size):
         A = create_quad_array(A_vals, shape=(size, size))
         B = create_quad_array(B_vals, shape=(size, size))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (size, size)
         
@@ -303,7 +290,7 @@ def test_large_square_matrices(self, size):
             assert_quad_equal(result[size//2, size//2], expected_value, rtol=1e-15, atol=1e-15)
     
     def test_large_vector_operations(self):
-        """Test large vector dot products"""
+        """Test large vector np.matmul products"""
         size = 1000
         
         # Create vectors with known sum
@@ -313,7 +300,7 @@ def test_large_vector_operations(self):
         x = create_quad_array(x_vals)
         y = create_quad_array(y_vals)
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = size * 1.0 * 2.0  # = 2000.0
         
         assert_quad_equal(result, expected)
@@ -329,7 +316,7 @@ def test_rectangular_large_matrices(self):
         A = create_quad_array(A_vals, shape=(m, k))
         B = create_quad_array(B_vals, shape=(k, n))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (m, n)
         
@@ -354,7 +341,7 @@ def test_dimension_mismatch_vectors(self):
         y = create_quad_array([1, 2, 3])
         
         with pytest.raises(ValueError, match="same length"):
-            dot(x, y)
+            np.matmul(x, y)
     
     def test_dimension_mismatch_matrix_vector(self):
         """Test dimension mismatch in matrix-vector"""
@@ -362,7 +349,7 @@ def test_dimension_mismatch_matrix_vector(self):
         x = create_quad_array([1, 2, 3])  # Wrong size
         
         with pytest.raises(ValueError, match="columns must match"):
-            dot(A, x)
+            np.matmul(A, x)
     
     def test_dimension_mismatch_matrices(self):
         """Test dimension mismatch in matrix-matrix"""
@@ -370,7 +357,7 @@ def test_dimension_mismatch_matrices(self):
         B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
         
         with pytest.raises(ValueError, match="Matrix inner dimensions must match"):
-            dot(A, B)
+            np.matmul(A, B)
 
 
 if __name__ == "__main__":

From f89c2e6a1bc66433a3c3dd8c3cd0e9a101b400e2 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 18 Jul 2025 01:22:58 +0530
Subject: [PATCH 10/20] will figure out later

---
 .github/workflows/build_wheels.yml            |  2 +-
 .github/workflows/ci.yml                      |  6 +-
 .../src/quadblas_interface.cpp                | 91 -------------------
 3 files changed, 4 insertions(+), 95 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e16e2ff4..0bf55c46 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - matmul-ufunc
+      - dot
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a70e8c10..e42c12c8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - matmul-ufunc
+      - dot
   pull_request:
   workflow_dispatch:
 
@@ -76,13 +76,13 @@ jobs:
           # Initialize submodules first
           git submodule update --init --recursive
           ls -la numpy_quaddtype/QBLAS/
-          
+
           # Set environment variables with proper export and correct paths
           export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include"
           export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals"
           export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp"
           export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
-          
+
           # Install with meson args to ensure the C++ flags are passed through
           python -m pip install . -v --no-build-isolation \
             -Cbuilddir=build \
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 46f953de..6bc3fb0d 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,95 +786,4 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
-// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
-// {
-//     Sleef_quad *data_a, *data_b;
-//     QuadBackendType backend_a, backend_b;
-//     QuadBLAS::Layout layout_a, layout_b;
-
-//     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-//         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-//         return nullptr;
-//     }
-
-//     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-//     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-//     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-//     if (!sleef_a || !sleef_b) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         return nullptr;
-//     }
-
-//     QuadBackendType result_backend = BACKEND_SLEEF;
-//     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-//         result_backend = BACKEND_LONGDOUBLE;
-//     }
-
-//     npy_intp result_dims[2] = {m, n};
-//     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-//     if (!result_dtype) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         return nullptr;
-//     }
-
-//     PyArrayObject *result =
-//             (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-//     if (!result) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         Py_DECREF(result_dtype);
-//         return nullptr;
-//     }
-
-//     Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-//     for (npy_intp i = 0; i < m * n; i++) {
-//         result_data[i] = Sleef_cast_from_doubleq1(0.0);
-//     }
-
-//     npy_intp lda, ldb, ldc;
-
-//     if (layout_a == QuadBLAS::Layout::RowMajor) {
-//         lda = k;
-//     }
-//     else {
-//         lda = m;
-//     }
-
-//     if (layout_b == QuadBLAS::Layout::RowMajor) {
-//         ldb = n;
-//     }
-//     else {
-//         ldb = k;
-//     }
-
-//     QuadBLAS::Layout result_layout = layout_a;
-//     if (result_layout == QuadBLAS::Layout::RowMajor) {
-//         ldc = n;
-//     }
-//     else {
-//         ldc = m;
-//     }
-
-//     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-//     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-//     QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-//                    ldc);
-
-//     if (result_backend == BACKEND_LONGDOUBLE) {
-//         long double *ld_result = (long double *)PyArray_DATA(result);
-//         for (npy_intp i = 0; i < m * n; i++) {
-//             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-//         }
-//     }
-
-//     QuadBLAS::aligned_free(temp_a);
-//     QuadBLAS::aligned_free(temp_b);
-
-//     return (PyObject *)result;
-// }
-
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file

From 894a84db72cf4f9ec3b0367ae669102cb61eecda Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 13:46:32 +0530
Subject: [PATCH 11/20] matmul registered with naive

---
 .../numpy_quaddtype/src/umath/matmul.cpp      | 264 +++++++++++++-----
 quaddtype/numpy_quaddtype/src/umath/matmul.h  |  38 ++-
 quaddtype/tests/test_dot.py                   |   6 +-
 3 files changed, 225 insertions(+), 83 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 01b8935d..00cc8589 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -5,141 +5,251 @@
 #define NO_IMPORT_ARRAY
 #define NO_IMPORT_UFUNC
 
+extern "C" {
 #include <Python.h>
 #include <cstdio>
+#include <string.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
 #include "numpy/ufuncobject.h"
 #include "numpy/dtype_api.h"
-#include "numpy/ndarraytypes.h"
+}
 
 #include "../quad_common.h"
 #include "../scalar.h"
 #include "../dtype.h"
 #include "../ops.hpp"
-#include "binary_ops.h"
 #include "matmul.h"
+#include "promoters.hpp"
 
-#include <iostream>
-
+/**
+ * Resolve descriptors for matmul operation.
+ * Follows the same pattern as binary_ops.cpp
+ */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
                                 npy_intp *NPY_UNUSED(view_offset))
 {
-    NPY_CASTING casting = NPY_NO_CASTING;
-    std::cout << "exiting the descriptor";
-    return casting;
-}
+    // Follow the exact same pattern as quad_binary_op_resolve_descriptors
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                           npy_intp const dimensions[], npy_intp const strides[],
-                                           NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+    // Determine target backend and if casting is needed
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
 
-    quad_value in1, in2, out;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
         }
         else {
-            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
         }
-        memcpy(out_ptr, &out, elem_size);
+    }
 
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
+    // Set up output descriptor
+    if (given_descrs[2] == NULL) {
+        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+        if (!loop_descrs[2]) {
+            return (NPY_CASTING)-1;
+        }
     }
-    return 0;
+    else {
+        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
+        if (descr_out->backend != target_backend) {
+            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[2]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[2]);
+            loop_descrs[2] = given_descrs[2];
+        }
+    }
+    return casting;
 }
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                         npy_intp const dimensions[], npy_intp const strides[],
-                                         NpyAuxData *auxdata)
+/**
+ * Matrix multiplication strided loop using NumPy 2.0 API.
+ * Implements general matrix multiplication for arbitrary dimensions.
+ *
+ * For matmul with signature (m?,n),(n,p?)->(m?,p?):
+ * - dimensions[0] = N (loop dimension, number of batch operations)
+ * - dimensions[1] = m (rows of first matrix)
+ * - dimensions[2] = n (cols of first matrix / rows of second matrix)
+ * - dimensions[3] = p (cols of second matrix)
+ *
+ * - strides[0], strides[1], strides[2] = batch strides for A, B, C
+ * - strides[3], strides[4] = row stride, col stride for A (m, n)
+ * - strides[5], strides[6] = row stride, col stride for B (n, p)
+ * - strides[7], strides[8] = row stride, col stride for C (m, p)
+ */
+static int
+quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                         npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
 {
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
+    // Extract dimensions
+    npy_intp N = dimensions[0];  // Number of batch operations
+    npy_intp m = dimensions[1];  // Rows of first matrix
+    npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
+    npy_intp p = dimensions[3];  // Cols of second matrix
+
+    // Extract batch strides
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    // Extract core strides for matrix dimensions
+    npy_intp A_row_stride = strides[3];  // Stride along m dimension of A
+    npy_intp A_col_stride = strides[4];  // Stride along n dimension of A
+    npy_intp B_row_stride = strides[5];  // Stride along n dimension of B
+    npy_intp B_col_stride = strides[6];  // Stride along p dimension of B
+    npy_intp C_row_stride = strides[7];  // Stride along m dimension of C
+    npy_intp C_col_stride = strides[8];  // Stride along p dimension of C
+
+    // Get backend from descriptor
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
 
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+    // Process each batch
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        // Perform matrix multiplication: C = A @ B
+        // C[i,j] = sum_k(A[i,k] * B[k,j])
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);  // Initialize to 0
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+
+                        // sum += A[i,k] * B[k,j]
+                        sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val));
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    // Long double backend
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
         }
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
     }
+
     return 0;
 }
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+/**
+ * Register matmul support following the exact same pattern as binary_ops.cpp
+ */
 int
-create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
+init_matmul_ops(PyObject *numpy)
 {
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n");
+
+    // Get the existing matmul ufunc - same pattern as binary_ops
+    PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
+        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
+    // Use the same pattern as binary_ops.cpp
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_matmul_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_matmul_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
+    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul",
             .nin = 2,
             .nout = 1,
             .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
             .dtypes = dtypes,
             .slots = slots,
     };
 
+    printf("DEBUG: About to add loop to matmul ufunc...\n");
+
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        printf("DEBUG: Failed to add loop to matmul ufunc\n");
+        Py_DECREF(ufunc);
         return -1;
     }
-    // my guess we don't need any promoter here as of now, since matmul is quad specific
-    return 0;
-}
 
-int
-init_matmul_ops(PyObject *numpy)
-{
-    if (create_matmul_ufunc<quad_add, ld_add>(numpy, "matmul") < 0) {
+    printf("DEBUG: Successfully added matmul loop!\n");
+
+    // Add promoter following binary_ops pattern
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == NULL) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(ufunc);
         return -1;
     }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
+        PyErr_Clear();  // Don't fail if promoter fails
+    }
+    else {
+        printf("DEBUG: Successfully added promoter\n");
+    }
+
+    Py_DECREF(DTypes);
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(ufunc);
+
+    printf("DEBUG: init_matmul_ops completed successfully\n");
     return 0;
-}
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
index bc099eba..947e2c3d 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.h
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -1,8 +1,40 @@
-#ifndef _QUADDTYPE_MATMUL_OPS_H
-#define _QUADDTYPE_MATMUL_OPS_H
+#ifndef _QUADDTYPE_MATMUL_H
+#define _QUADDTYPE_MATMUL_H
+
+/**
+ * Quad Precision Matrix Multiplication for NumPy
+ *
+ * This module implements matrix multiplication functionality for the QuadPrecDType
+ * by registering custom loops with numpy's matmul generalized ufunc.
+ *
+ * Supports all matmul operation types:
+ * - Vector-vector (dot product): (n,) @ (n,) -> scalar
+ * - Matrix-vector: (m,n) @ (n,) -> (m,)
+ * - Vector-matrix: (n,) @ (n,p) -> (p,)
+ * - Matrix-matrix: (m,n) @ (n,p) -> (m,p)
+ *
+ * Uses naive algorithms optimized for correctness rather than performance.
+ * For production use, consider integration with QBLAS optimized routines.
+ */
 
 #include <Python.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the matmul operations for the quad precision dtype.
+ * This function registers the matmul generalized ufunc with numpy.
+ *
+ * @param numpy The numpy module object
+ * @return 0 on success, -1 on failure
+ */
 int
 init_matmul_ops(PyObject *numpy);
-#endif
\ No newline at end of file
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // _QUADDTYPE_MATMUL_H
\ No newline at end of file
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index f3fa3f62..9256f3d8 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -340,7 +340,7 @@ def test_dimension_mismatch_vectors(self):
         x = create_quad_array([1, 2])
         y = create_quad_array([1, 2, 3])
         
-        with pytest.raises(ValueError, match="same length"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(x, y)
     
     def test_dimension_mismatch_matrix_vector(self):
@@ -348,7 +348,7 @@ def test_dimension_mismatch_matrix_vector(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         x = create_quad_array([1, 2, 3])  # Wrong size
         
-        with pytest.raises(ValueError, match="columns must match"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(A, x)
     
     def test_dimension_mismatch_matrices(self):
@@ -356,7 +356,7 @@ def test_dimension_mismatch_matrices(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
         
-        with pytest.raises(ValueError, match="Matrix inner dimensions must match"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(A, B)
 
 

From 6800a906b1fbcdd363f02482c5b6f225463cc456 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 16:29:09 +0530
Subject: [PATCH 12/20] adding initial qblas support to matmul ufunc, something
 is breaking, nan

---
 .../src/quadblas_interface.cpp                | 806 ++----------------
 .../numpy_quaddtype/src/quadblas_interface.h  |  34 +-
 .../numpy_quaddtype/src/umath/matmul.cpp      | 263 +++---
 3 files changed, 257 insertions(+), 846 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6bc3fb0d..185e2d87 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -1,773 +1,126 @@
-#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
-#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
-#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
-#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
-#define NO_IMPORT_ARRAY
-#define NO_IMPORT_UFUNC
-
-extern "C" {
-#include <Python.h>
-#include "numpy/arrayobject.h"
-#include "numpy/ndarraytypes.h"
-#include "numpy/dtype_api.h"
-}
-
-#include "scalar.h"
-#include "dtype.h"
-#include "quad_common.h"
 #include "quadblas_interface.h"
-
-extern "C" {
-#include <sleef.h>
-#include <sleefquad.h>
-}
-
-#ifndef DISABLE_QUADBLAS
 #include "../QBLAS/include/quadblas/quadblas.hpp"
-#endif
-
-#ifdef DISABLE_QUADBLAS
-
-static bool
-extract_quad_array_info_simple(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend)
-{
-    if (!PyArray_Check(arr)) {
-        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
-        return false;
-    }
-
-    PyArray_Descr *descr = PyArray_DESCR(arr);
-    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
-        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
-        return false;
-    }
-
-    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
-    *backend = quad_descr->backend;
-    *data = (Sleef_quad *)PyArray_DATA(arr);
-
-    return true;
-}
-
-static Sleef_quad *
-ensure_sleef_backend_simple(PyArrayObject *arr, QuadBackendType original_backend,
-                            Sleef_quad **temp_storage)
-{
-    if (original_backend == BACKEND_SLEEF) {
-        *temp_storage = nullptr;
-        return (Sleef_quad *)PyArray_DATA(arr);
-    }
-
-    npy_intp size = PyArray_SIZE(arr);
-    *temp_storage = (Sleef_quad *)malloc(size * sizeof(Sleef_quad));
-    if (!*temp_storage) {
-        PyErr_NoMemory();
-        return nullptr;
-    }
-
-    long double *ld_data = (long double *)PyArray_DATA(arr);
-    for (npy_intp i = 0; i < size; i++) {
-        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);
-    }
+#include <cstring>
+#include <algorithm>
 
-    return *temp_storage;
-}
-
-// ===============================================================================
-// FALLBACK IMPLEMENTATIONS (No QuadBLAS)
-// ===============================================================================
+extern "C" {
 
-static PyObject *
-dot_vector_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
 {
-    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp n_a = PyArray_DIM(a, 0);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n_a != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    // Simple dot product implementation
-    Sleef_quad result = Sleef_cast_from_doubleq1(0.0);
-    for (npy_intp i = 0; i < n_a; i++) {
-        result = Sleef_fmaq1_u05(sleef_a[i], sleef_b[i], result);
-    }
-
-    free(temp_a);
-    free(temp_b);
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
-    if (!result_obj) {
-        return nullptr;
+    if (!x || !y || !result || n == 0) {
+        return -1;
     }
 
-    if (result_backend == BACKEND_SLEEF) {
-        result_obj->value.sleef_value = result;
+    try {
+        *result = QuadBLAS::dot(n, x, incx, y, incy);
+        return 0;
     }
-    else {
-        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
+    catch (...) {
+        return -1;
     }
-
-    return (PyObject *)result_obj;
 }
 
-static PyObject *
-dot_matrix_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
 {
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp n = PyArray_DIM(a, 1);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[1] = {m};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        free(temp_a);
-        free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    // Initialize result to zero
-    for (npy_intp i = 0; i < m; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
+    if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) {
+        return -1;
     }
 
-    // Simple matrix-vector multiplication: result[i] = sum(A[i,j] * b[j])
-    for (npy_intp i = 0; i < m; i++) {
-        Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
-        for (npy_intp j = 0; j < n; j++) {
-            // Assume row-major layout: A[i,j] = sleef_a[i*n + j]
-            sum = Sleef_fmaq1_u05(sleef_a[i * n + j], sleef_b[j], sum);
+    try {
+        // Convert layout
+        QuadBLAS::Layout qblas_layout;
+        if (layout == 'R' || layout == 'r') {
+            qblas_layout = QuadBLAS::Layout::RowMajor;
         }
-        result_data[i] = sum;
-    }
-
-    // Convert to longdouble if needed
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        else if (layout == 'C' || layout == 'c') {
+            qblas_layout = QuadBLAS::Layout::ColMajor;
         }
-    }
-
-    free(temp_a);
-    free(temp_b);
-
-    return (PyObject *)result;
-}
-
-static PyObject *
-dot_matrix_matrix_fallback(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp k = PyArray_DIM(a, 1);
-    npy_intp k_b = PyArray_DIM(b, 0);
-    npy_intp n = PyArray_DIM(b, 1);
-
-    if (k != k_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        free(temp_a);
-        free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    // Initialize result matrix to zero
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    // Simple matrix-matrix multiplication: C[i,j] = sum(A[i,l] * B[l,j])
-    for (npy_intp i = 0; i < m; i++) {
-        for (npy_intp j = 0; j < n; j++) {
-            Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
-            for (npy_intp l = 0; l < k; l++) {
-                // Row-major: A[i,l] = sleef_a[i*k + l], B[l,j] = sleef_b[l*n + j]
-                sum = Sleef_fmaq1_u05(sleef_a[i * k + l], sleef_b[l * n + j], sum);
-            }
-            result_data[i * n + j] = sum;
+        else {
+            return -1;  // Invalid layout
         }
-    }
 
-    // Convert to longdouble if needed
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        // Handle transpose (swap dimensions for transpose)
+        size_t actual_m = m, actual_n = n;
+        if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') {
+            std::swap(actual_m, actual_n);
+            // For transpose, we need to adjust the layout
+            if (qblas_layout == QuadBLAS::Layout::RowMajor) {
+                qblas_layout = QuadBLAS::Layout::ColMajor;
+            }
+            else {
+                qblas_layout = QuadBLAS::Layout::RowMajor;
+            }
         }
-    }
 
-    free(temp_a);
-    free(temp_b);
+        // Call QBLAS GEMV
+        QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy);
 
-    return (PyObject *)result;
-}
-
-PyObject *
-py_quadblas_dot(PyObject *self, PyObject *args)
-{
-    PyObject *a_obj, *b_obj;
-
-    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
-        return nullptr;
-    }
-
-    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
-    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
-
-    if (!a || !b) {
-        Py_XDECREF(a);
-        Py_XDECREF(b);
-        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
-        return nullptr;
-    }
-
-    PyObject *result = nullptr;
-
-    int ndim_a = PyArray_NDIM(a);
-    int ndim_b = PyArray_NDIM(b);
-
-    if (ndim_a == 1 && ndim_b == 1) {
-        result = dot_vector_vector_fallback(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 1) {
-        result = dot_matrix_vector_fallback(a, b);
+        return 0;
     }
-    else if (ndim_a == 2 && ndim_b == 2) {
-        result = dot_matrix_matrix_fallback(a, b);
+    catch (...) {
+        return -1;
     }
-    else if (ndim_a == 1 && ndim_b == 2) {
-        PyErr_SetString(PyExc_ValueError,
-                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
-    }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
-    }
-
-    Py_DECREF(a);
-    Py_DECREF(b);
-
-    return result;
 }
 
-// Dummy implementations for other QuadBLAS functions
-PyObject *
-py_quadblas_set_num_threads(PyObject *self, PyObject *args)
-{
-    // On Windows fallback, just ignore thread setting
-    Py_RETURN_NONE;
-}
-
-PyObject *
-py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc)
 {
-    // Return 1 for fallback implementation
-    return PyLong_FromLong(1);
-}
-
-PyObject *
-py_quadblas_get_version(PyObject *self, PyObject *args)
-{
-    return PyUnicode_FromString("QuadBLAS is disabled for MSVC");
-}
-
-#else
-
-static QuadBLAS::Layout
-get_quadblas_layout(PyArrayObject *arr)
-{
-    if (PyArray_IS_C_CONTIGUOUS(arr)) {
-        return QuadBLAS::Layout::RowMajor;
-    }
-    else {
-        return QuadBLAS::Layout::ColMajor;
-    }
-}
-
-static bool
-extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend,
-                        QuadBLAS::Layout *layout)
-{
-    if (!PyArray_Check(arr)) {
-        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
-        return false;
-    }
-
-    PyArray_Descr *descr = PyArray_DESCR(arr);
-    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
-        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
-        return false;
-    }
-
-    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
-    *backend = quad_descr->backend;
-    *data = (Sleef_quad *)PyArray_DATA(arr);
-    *layout = get_quadblas_layout(arr);
-
-    return true;
-}
-
-static Sleef_quad *
-ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend,
-                     Sleef_quad **temp_storage)
-{
-    if (original_backend == BACKEND_SLEEF) {
-        *temp_storage = nullptr;
-        return (Sleef_quad *)PyArray_DATA(arr);
-    }
-
-    npy_intp size = PyArray_SIZE(arr);
-    *temp_storage = QuadBLAS::aligned_alloc<Sleef_quad>(size);
-    if (!*temp_storage) {
-        PyErr_NoMemory();
-        return nullptr;
-    }
-
-    long double *ld_data = (long double *)PyArray_DATA(arr);
-    for (npy_intp i = 0; i < size; i++) {
-        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);
-    }
-
-    return *temp_storage;
-}
-
-static PyObject *
-dot_vector_vector(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp n_a = PyArray_DIM(a, 0);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n_a != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    npy_intp stride_a = PyArray_STRIDE(a, 0) / PyArray_ITEMSIZE(a);
-    npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
-
-    Sleef_quad result = QuadBLAS::dot(n_a, sleef_a, stride_a, sleef_b, stride_b);
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
-    if (!result_obj) {
-        return nullptr;
-    }
-
-    if (result_backend == BACKEND_SLEEF) {
-        result_obj->value.sleef_value = result;
-    }
-    else {
-        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
-    }
-
-    return (PyObject *)result_obj;
-}
-
-static PyObject *
-dot_matrix_vector(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp n = PyArray_DIM(a, 1);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[1] = {m};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
+    if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) {
+        return -1;
     }
 
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    npy_intp lda;
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = n;
-    }
-    else {
-        lda = m;
-    }
-
-    npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
-    npy_intp stride_result = PyArray_STRIDE(result, 0) / PyArray_ITEMSIZE(result);
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, sleef_b, stride_b, beta, result_data,
-                   stride_result);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+    try {
+        // Convert layout
+        QuadBLAS::Layout qblas_layout;
+        if (layout == 'R' || layout == 'r') {
+            qblas_layout = QuadBLAS::Layout::RowMajor;
+        }
+        else if (layout == 'C' || layout == 'c') {
+            qblas_layout = QuadBLAS::Layout::ColMajor;
+        }
+        else {
+            return -1;  // Invalid layout
         }
-    }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
-}
-
-static PyObject *
-dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp k = PyArray_DIM(a, 1);
-    npy_intp k_b = PyArray_DIM(b, 0);
-    npy_intp n = PyArray_DIM(b, 1);
-
-    if (k != k_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    npy_intp lda, ldb, ldc;
 
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = k;
-    }
-    else {
-        lda = m;
-    }
+        // For now, we only support no transpose
+        // TODO: Implement transpose support if needed
+        if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) {
+            return -1;  // Transpose not implemented yet
+        }
 
-    if (layout_b == QuadBLAS::Layout::RowMajor) {
-        ldb = n;
-    }
-    else {
-        ldb = k;
-    }
+        // Call QBLAS GEMM
+        QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
 
-    QuadBLAS::Layout result_layout = layout_a;
-    if (result_layout == QuadBLAS::Layout::RowMajor) {
-        ldc = n;
-    }
-    else {
-        ldc = m;
+        return 0;
     }
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-                   ldc);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-        }
+    catch (...) {
+        return -1;
     }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
 }
 
-PyObject *
-py_quadblas_dot(PyObject *self, PyObject *args)
+int
+qblas_supports_backend(QuadBackendType backend)
 {
-    PyObject *a_obj, *b_obj;
-
-    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
-        return nullptr;
-    }
-
-    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
-    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
-
-    if (!a || !b) {
-        Py_XDECREF(a);
-        Py_XDECREF(b);
-        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
-        return nullptr;
-    }
-
-    PyObject *result = nullptr;
-
-    int ndim_a = PyArray_NDIM(a);
-    int ndim_b = PyArray_NDIM(b);
-
-    if (ndim_a == 1 && ndim_b == 1) {
-        result = dot_vector_vector(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 1) {
-        result = dot_matrix_vector(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 2) {
-        result = dot_matrix_matrix(a, b);
-    }
-    else if (ndim_a == 1 && ndim_b == 2) {
-        PyErr_SetString(PyExc_ValueError,
-                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
-    }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
-    }
-
-    Py_DECREF(a);
-    Py_DECREF(b);
-
-    return result;
+    // QBLAS only supports SLEEF backend
+    return (backend == BACKEND_SLEEF) ? 1 : 0;
 }
 
 PyObject *
 py_quadblas_set_num_threads(PyObject *self, PyObject *args)
 {
     int num_threads;
-
     if (!PyArg_ParseTuple(args, "i", &num_threads)) {
-        return nullptr;
+        return NULL;
     }
 
-    if (num_threads < 1) {
+    if (num_threads <= 0) {
         PyErr_SetString(PyExc_ValueError, "Number of threads must be positive");
-        return nullptr;
+        return NULL;
     }
 
     QuadBLAS::set_num_threads(num_threads);
@@ -777,13 +130,14 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args)
 PyObject *
 py_quadblas_get_num_threads(PyObject *self, PyObject *args)
 {
-    return PyLong_FromLong(QuadBLAS::get_num_threads());
+    int num_threads = QuadBLAS::get_num_threads();
+    return PyLong_FromLong(num_threads);
 }
 
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args)
 {
-    return PyUnicode_FromString(QuadBLAS::VERSION);
+    return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS");
 }
 
-#endif  // DISABLE_QUADBLAS
\ No newline at end of file
+}  // extern "C"
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index da8f0a85..ff9ed537 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -1,23 +1,39 @@
-#ifndef _QUADDTYPE_QUADBLAS_INTERFACE_H
-#define _QUADDTYPE_QUADBLAS_INTERFACE_H
+#ifndef QUADBLAS_INTERFACE_H
+#define QUADBLAS_INTERFACE_H
+
+#include <stddef.h>
+#include <Python.h>
+#include "quad_common.h"
+#include <sleefquad.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include <Python.h>
-
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result);
 
-PyObject* py_quadblas_dot(PyObject* self, PyObject* args);
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy);
 
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc);
 
-PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args);
-PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args);
+int
+qblas_supports_backend(QuadBackendType backend);
 
-PyObject* py_quadblas_get_version(PyObject* self, PyObject* args);
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args);
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args);
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif 
\ No newline at end of file
+#endif  // QUADBLAS_INTERFACE_H
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 00cc8589..d1adc7f3 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -22,43 +22,36 @@ extern "C" {
 #include "../ops.hpp"
 #include "matmul.h"
 #include "promoters.hpp"
+#include "../quadblas_interface.h"
 
 /**
  * Resolve descriptors for matmul operation.
- * Follows the same pattern as binary_ops.cpp
+ * Only supports SLEEF backend when QBLAS is enabled.
  */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
                                 npy_intp *NPY_UNUSED(view_offset))
 {
-    // Follow the exact same pattern as quad_binary_op_resolve_descriptors
     QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
     QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
 
-    // Determine target backend and if casting is needed
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
+    // QBLAS only supports SLEEF backend
+    if (descr_in1->backend != BACKEND_SLEEF || descr_in2->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "QBLAS-accelerated matmul only supports SLEEF backend. "
+                        "Other backends are not supported with QBLAS.");
+        return (NPY_CASTING)-1;
     }
 
-    // Set up input descriptors, casting if necessary
+    // Both inputs must use SLEEF backend
+    QuadBackendType target_backend = BACKEND_SLEEF;
+    NPY_CASTING casting = NPY_NO_CASTING;
+
+    // Set up input descriptors
     for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
+        Py_INCREF(given_descrs[i]);
+        loop_descrs[i] = given_descrs[i];
     }
 
     // Set up output descriptor
@@ -71,10 +64,9 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     else {
         QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
         if (descr_out->backend != target_backend) {
-            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[2]) {
-                return (NPY_CASTING)-1;
-            }
+            PyErr_SetString(PyExc_NotImplementedError,
+                            "QBLAS-accelerated matmul only supports SLEEF backend for output.");
+            return (NPY_CASTING)-1;
         }
         else {
             Py_INCREF(given_descrs[2]);
@@ -85,117 +77,166 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
 }
 
 /**
- * Matrix multiplication strided loop using NumPy 2.0 API.
- * Implements general matrix multiplication for arbitrary dimensions.
- *
- * For matmul with signature (m?,n),(n,p?)->(m?,p?):
- * - dimensions[0] = N (loop dimension, number of batch operations)
- * - dimensions[1] = m (rows of first matrix)
- * - dimensions[2] = n (cols of first matrix / rows of second matrix)
- * - dimensions[3] = p (cols of second matrix)
- *
- * - strides[0], strides[1], strides[2] = batch strides for A, B, C
- * - strides[3], strides[4] = row stride, col stride for A (m, n)
- * - strides[5], strides[6] = row stride, col stride for B (n, p)
- * - strides[7], strides[8] = row stride, col stride for C (m, p)
+ * Determine the type of operation based on input dimensions
+ */
+enum MatmulOperationType {
+    MATMUL_DOT,   // 1D x 1D -> scalar
+    MATMUL_GEMV,  // 2D x 1D -> 1D
+    MATMUL_GEMM   // 2D x 2D -> 2D
+};
+
+static MatmulOperationType
+determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
+{
+    // For matmul signature (m?,n),(n,p?)->(m?,p?):
+    // - If m=1 and p=1: vector dot product (1D x 1D)
+    // - If p=1: matrix-vector multiplication (2D x 1D)
+    // - Otherwise: matrix-matrix multiplication (2D x 2D)
+
+    if (m == 1 && p == 1) {
+        return MATMUL_DOT;
+    }
+    else if (p == 1) {
+        return MATMUL_GEMV;
+    }
+    else {
+        return MATMUL_GEMM;
+    }
+}
+
+/**
+ * Matrix multiplication strided loop using QBLAS.
+ * Automatically selects the appropriate QBLAS operation based on input dimensions.
  */
 static int
 quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                          npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
 {
     // Extract dimensions
-    npy_intp N = dimensions[0];  // Number of batch operations
+    npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
     npy_intp m = dimensions[1];  // Rows of first matrix
     npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
     npy_intp p = dimensions[3];  // Cols of second matrix
 
     // Extract batch strides
-    npy_intp A_batch_stride = strides[0];
-    npy_intp B_batch_stride = strides[1];
-    npy_intp C_batch_stride = strides[2];
+    npy_intp A_stride = strides[0];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
 
     // Extract core strides for matrix dimensions
-    npy_intp A_row_stride = strides[3];  // Stride along m dimension of A
-    npy_intp A_col_stride = strides[4];  // Stride along n dimension of A
-    npy_intp B_row_stride = strides[5];  // Stride along n dimension of B
-    npy_intp B_col_stride = strides[6];  // Stride along p dimension of B
-    npy_intp C_row_stride = strides[7];  // Stride along m dimension of C
-    npy_intp C_col_stride = strides[8];  // Stride along p dimension of C
-
-    // Get backend from descriptor
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls
+
+    // Get backend from descriptor (should be SLEEF only)
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    // Process each batch
-    for (npy_intp batch = 0; batch < N; batch++) {
-        char *A_batch = data[0] + batch * A_batch_stride;
-        char *B_batch = data[1] + batch * B_batch_stride;
-        char *C_batch = data[2] + batch * C_batch_stride;
-
-        // Perform matrix multiplication: C = A @ B
-        // C[i,j] = sum_k(A[i,k] * B[k,j])
-        for (npy_intp i = 0; i < m; i++) {
-            for (npy_intp j = 0; j < p; j++) {
-                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
-
-                if (backend == BACKEND_SLEEF) {
-                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);  // Initialize to 0
-
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
-
-                        Sleef_quad a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
-                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
-
-                        // sum += A[i,k] * B[k,j]
-                        sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val));
-                    }
-
-                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
-                }
-                else {
-                    // Long double backend
-                    long double sum = 0.0L;
-
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
-
-                        long double a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(long double));
-                        memcpy(&b_val, B_kj, sizeof(long double));
-
-                        sum += a_val * b_val;
-                    }
-
-                    memcpy(C_ij, &sum, sizeof(long double));
-                }
-            }
+    if (descr->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        return -1;
+    }
+
+    // Determine operation type
+    MatmulOperationType op_type = determine_operation_type(m, n, p);
+
+    // Constants for QBLAS
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    // print all information for debugging
+    printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N,
+           (long)m, (long)n, (long)p);
+    printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride,
+           (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride,
+           (long)C_col_stride);
+    printf("DEBUG: Operation type: %d\n", op_type);
+
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
+
+    Sleef_quad *A_ptr = (Sleef_quad *)A;
+    Sleef_quad *B_ptr = (Sleef_quad *)B;
+    Sleef_quad *C_ptr = (Sleef_quad *)C;
+
+    int result = -1;
+
+    switch (op_type) {
+        case MATMUL_DOT: {
+            // Vector dot product: C = A^T * B (both are vectors)
+            // A has shape (1, n), B has shape (n, 1), C has shape (1, 1)
+
+            printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n);
+
+            // A is effectively a vector of length n
+            // B is effectively a vector of length n
+            size_t incx = A_col_stride / sizeof(Sleef_quad);
+            size_t incy = B_row_stride / sizeof(Sleef_quad);
+
+            result = qblas_dot(n, A_ptr, incx, B_ptr, incy, C_ptr);
+            break;
         }
+
+        case MATMUL_GEMV: {
+            // Matrix-vector multiplication: C = A * B
+            // A has shape (m, n), B has shape (n, 1), C has shape (m, 1)
+
+            printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m,
+                   (long)n, (long)n);
+
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t incx = B_row_stride / sizeof(Sleef_quad);
+            size_t incy = C_row_stride / sizeof(Sleef_quad);
+
+            result =
+                    qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy);
+            break;
+        }
+
+        case MATMUL_GEMM: {
+            // Matrix-matrix multiplication: C = A * B
+            // A has shape (m, n), B has shape (n, p), C has shape (m, p)
+
+            printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p);
+
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t ldb = B_row_stride / sizeof(Sleef_quad);
+            size_t ldc = C_row_stride / sizeof(Sleef_quad);
+
+            result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
+                                C_ptr, ldc);
+            break;
+        }
+    }
+
+    if (result != 0) {
+        PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
+        return -1;
     }
 
     return 0;
 }
 
 /**
- * Register matmul support following the exact same pattern as binary_ops.cpp
+ * Register matmul support with QBLAS acceleration
  */
 int
 init_matmul_ops(PyObject *numpy)
 {
-    printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n");
+    printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n");
 
-    // Get the existing matmul ufunc - same pattern as binary_ops
+    // Get the existing matmul ufunc
     PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
         printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
-    // Use the same pattern as binary_ops.cpp
+    // Setup method specification for QBLAS-accelerated matmul
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
@@ -204,7 +245,7 @@ init_matmul_ops(PyObject *numpy)
                            {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
-            .name = "quad_matmul",
+            .name = "quad_matmul_qblas",
             .nin = 2,
             .nout = 1,
             .casting = NPY_NO_CASTING,
@@ -213,17 +254,17 @@ init_matmul_ops(PyObject *numpy)
             .slots = slots,
     };
 
-    printf("DEBUG: About to add loop to matmul ufunc...\n");
+    printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n");
 
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        printf("DEBUG: Failed to add loop to matmul ufunc\n");
+        printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n");
         Py_DECREF(ufunc);
         return -1;
     }
 
-    printf("DEBUG: Successfully added matmul loop!\n");
+    printf("DEBUG: Successfully added QBLAS matmul loop!\n");
 
-    // Add promoter following binary_ops pattern
+    // Add promoter
     PyObject *promoter_capsule =
             PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
     if (promoter_capsule == NULL) {
@@ -250,6 +291,6 @@ init_matmul_ops(PyObject *numpy)
     Py_DECREF(promoter_capsule);
     Py_DECREF(ufunc);
 
-    printf("DEBUG: init_matmul_ops completed successfully\n");
+    printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n");
     return 0;
 }
\ No newline at end of file

From 742ce642aa173487a8253da10518e30419621ec3 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 17:38:59 +0530
Subject: [PATCH 13/20] matmul ufunc completed, naive plugged, qblas
 experimental

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../src/quadblas_interface.cpp                |   2 -
 .../numpy_quaddtype/src/umath/matmul.cpp      | 189 +++++++++++-------
 3 files changed, 116 insertions(+), 77 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 4853ac1c..9468e24a 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d
+Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 185e2d87..7ef618fd 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -75,7 +75,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k,
     }
 
     try {
-        // Convert layout
         QuadBLAS::Layout qblas_layout;
         if (layout == 'R' || layout == 'r') {
             qblas_layout = QuadBLAS::Layout::RowMajor;
@@ -93,7 +92,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k,
             return -1;  // Transpose not implemented yet
         }
 
-        // Call QBLAS GEMM
         QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
 
         return 0;
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index d1adc7f3..e192a670 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -24,10 +24,6 @@ extern "C" {
 #include "promoters.hpp"
 #include "../quadblas_interface.h"
 
-/**
- * Resolve descriptors for matmul operation.
- * Only supports SLEEF backend when QBLAS is enabled.
- */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
@@ -76,23 +72,15 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     return casting;
 }
 
-/**
- * Determine the type of operation based on input dimensions
- */
 enum MatmulOperationType {
-    MATMUL_DOT,   // 1D x 1D -> scalar
-    MATMUL_GEMV,  // 2D x 1D -> 1D
-    MATMUL_GEMM   // 2D x 2D -> 2D
+    MATMUL_DOT,
+    MATMUL_GEMV,
+    MATMUL_GEMM
 };
 
 static MatmulOperationType
 determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
 {
-    // For matmul signature (m?,n),(n,p?)->(m?,p?):
-    // - If m=1 and p=1: vector dot product (1D x 1D)
-    // - If p=1: matrix-vector multiplication (2D x 1D)
-    // - Otherwise: matrix-matrix multiplication (2D x 2D)
-
     if (m == 1 && p == 1) {
         return MATMUL_DOT;
     }
@@ -104,10 +92,6 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
     }
 }
 
-/**
- * Matrix multiplication strided loop using QBLAS.
- * Automatically selects the appropriate QBLAS operation based on input dimensions.
- */
 static int
 quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                          npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
@@ -118,12 +102,12 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
     npy_intp p = dimensions[3];  // Cols of second matrix
 
-    // Extract batch strides
+    // batch strides
     npy_intp A_stride = strides[0];
     npy_intp B_stride = strides[1];
     npy_intp C_stride = strides[2];
 
-    // Extract core strides for matrix dimensions
+    // core strides for matrix dimensions
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
     npy_intp B_row_stride = strides[5];
@@ -131,30 +115,16 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp C_row_stride = strides[7];
     npy_intp C_col_stride = strides[8];
 
-    // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls
-
-    // Get backend from descriptor (should be SLEEF only)
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
         PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
         return -1;
     }
 
-    // Determine operation type
     MatmulOperationType op_type = determine_operation_type(m, n, p);
-
-    // Constants for QBLAS
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
 
-    // print all information for debugging
-    printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N,
-           (long)m, (long)n, (long)p);
-    printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride,
-           (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride,
-           (long)C_col_stride);
-    printf("DEBUG: Operation type: %d\n", op_type);
-
     char *A = data[0];
     char *B = data[1];
     char *C = data[2];
@@ -167,13 +137,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
 
     switch (op_type) {
         case MATMUL_DOT: {
-            // Vector dot product: C = A^T * B (both are vectors)
-            // A has shape (1, n), B has shape (n, 1), C has shape (1, 1)
-
-            printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n);
-
-            // A is effectively a vector of length n
-            // B is effectively a vector of length n
             size_t incx = A_col_stride / sizeof(Sleef_quad);
             size_t incy = B_row_stride / sizeof(Sleef_quad);
 
@@ -182,12 +145,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }
 
         case MATMUL_GEMV: {
-            // Matrix-vector multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, 1), C has shape (m, 1)
-
-            printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m,
-                   (long)n, (long)n);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t incx = B_row_stride / sizeof(Sleef_quad);
             size_t incy = C_row_stride / sizeof(Sleef_quad);
@@ -198,17 +155,46 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }
 
         case MATMUL_GEMM: {
-            // Matrix-matrix multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, p), C has shape (m, p)
-
-            printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t ldb = B_row_stride / sizeof(Sleef_quad);
-            size_t ldc = C_row_stride / sizeof(Sleef_quad);
+            size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+
+            Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
+            if (!temp_A_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
+            if (!temp_B_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
+            A_ptr = temp_A_buffer;
+            B_ptr = temp_B_buffer;
+
+            Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
+            if (!temp_C_buffer) {
+                PyErr_SetString(PyExc_MemoryError,
+                                "Failed to allocate temporary buffer for GEMM result");
+                return -1;
+            }
+
+            size_t ldc_temp = p;
 
             result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
-                                C_ptr, ldc);
+                                temp_C_buffer, ldc_temp);
+
+            if (result == 0) {
+                memcpy(C_ptr, temp_C_buffer, m * p * sizeof(Sleef_quad));
+            }
+
+            delete[] temp_C_buffer;
+            delete[] temp_A_buffer;
+            delete[] temp_B_buffer;
             break;
         }
     }
@@ -221,27 +207,91 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     return 0;
 }
 
-/**
- * Register matmul support with QBLAS acceleration
- */
+static int
+naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                          npy_intp const dimensions[], npy_intp const strides[],
+                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    npy_intp m = dimensions[1];
+    npy_intp n = dimensions[2];
+    npy_intp p = dimensions[3];
+
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+                        sum = Sleef_fmaq1_u05(a_val, b_val, sum);
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
 int
 init_matmul_ops(PyObject *numpy)
 {
-    printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n");
-
-    // Get the existing matmul ufunc
     PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
-        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
-    // Setup method specification for QBLAS-accelerated matmul
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
-                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
@@ -254,17 +304,11 @@ init_matmul_ops(PyObject *numpy)
             .slots = slots,
     };
 
-    printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n");
-
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n");
         Py_DECREF(ufunc);
         return -1;
     }
 
-    printf("DEBUG: Successfully added QBLAS matmul loop!\n");
-
-    // Add promoter
     PyObject *promoter_capsule =
             PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
     if (promoter_capsule == NULL) {
@@ -280,17 +324,14 @@ init_matmul_ops(PyObject *numpy)
     }
 
     if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
         PyErr_Clear();  // Don't fail if promoter fails
     }
     else {
-        printf("DEBUG: Successfully added promoter\n");
     }
 
     Py_DECREF(DTypes);
     Py_DECREF(promoter_capsule);
     Py_DECREF(ufunc);
 
-    printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n");
     return 0;
 }
\ No newline at end of file

From d993bc92bb760c4e0428ac9542ed97272247a4af Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 17:47:11 +0530
Subject: [PATCH 14/20] adding release tracker to keep record for tasks, v1.0.0

---
 quaddtype/release_tracker.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index 1ecf7d3a..3ed10042 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -1,4 +1,4 @@
-# Plan for `numpy-quaddtype` v1.5
+# Plan for `numpy-quaddtype` v1.0.0
 
 | ufunc name    | Added |
 | ------------- | ----- |
@@ -91,3 +91,5 @@
 | floor         | ✅    |
 | ceil          | ✅    |
 | trunc         | ✅    |
+
+- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback

From c518a29281de14955e8125cb8845200d9a8b46a9 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 15:33:46 +0000
Subject: [PATCH 15/20] it should be failing but passes on x86-64

---
 quaddtype/numpy_quaddtype/QBLAS                | 2 +-
 quaddtype/numpy_quaddtype/src/umath/matmul.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 9468e24a..0eabb677 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
+Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index e192a670..f31ec89f 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -290,7 +290,7 @@ init_matmul_ops(PyObject *numpy)
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
 

From bbce2ac58f930ad04a89ea877ef2013f5b49c8c7 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 18:17:04 +0000
Subject: [PATCH 16/20] ahh stupid me :), fallback to naive for MSVC

---
 quaddtype/numpy_quaddtype/__init__.py         |  6 +-
 .../src/quadblas_interface.cpp                | 94 ++++++++++++++++++-
 .../numpy_quaddtype/src/quadblas_interface.h  |  5 +
 .../numpy_quaddtype/src/umath/matmul.cpp      | 11 +++
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index 8b588c11..878180bc 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -39,8 +39,4 @@ def LongDoubleQuadPrecDType():
 ln10 = get_sleef_constant("ln10")
 max_value = get_sleef_constant("quad_max")
 min_value = get_sleef_constant("quad_min")
-epsilon = get_sleef_constant("epsilon")
-
-num_cores = multiprocessing.cpu_count()
-# set default number of threads for QuadBLAS
-set_num_threads(num_cores)
\ No newline at end of file
+epsilon = get_sleef_constant("epsilon")
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 7ef618fd..6eb57574 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -1,10 +1,16 @@
 #include "quadblas_interface.h"
-#include "../QBLAS/include/quadblas/quadblas.hpp"
 #include <cstring>
 #include <algorithm>
 
+#ifndef DISABLE_QUADBLAS
+#include "../QBLAS/include/quadblas/quadblas.hpp"
+#endif // DISABLE_QUADBLAS
+
 extern "C" {
 
+
+#ifndef  DISABLE_QUADBLAS
+
 int
 qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
 {
@@ -138,4 +144,90 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS");
 }
 
+int
+quadblas_set_num_threads(int num_threads)
+{
+    QuadBLAS::set_num_threads(num_threads);
+    return 0;
+}
+
+int
+quadblas_get_num_threads(void)
+{
+    int num_threads = QuadBLAS::get_num_threads();
+    return num_threads;
+}
+
+#else  // DISABLE_QUADBLAS
+
+
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
+{
+    return -1;  // QBLAS is disabled, dot product not available
+}
+
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
+{
+    return -1;  // QBLAS is disabled, GEMV not available
+}
+
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc)
+{
+    return -1;  // QBLAS is disabled, GEMM not available
+}
+
+int
+qblas_supports_backend(QuadBackendType backend)
+{
+    return -1; // QBLAS is disabled, backend support not available
+}
+
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+int
+quadblas_set_num_threads(int num_threads)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return -1;
+}
+
+int
+quadblas_get_num_threads(void)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return -1;
+}
+
+#endif // DISABLE_QUADBLAS
+
 }  // extern "C"
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index ff9ed537..82a685a9 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -32,6 +32,11 @@ py_quadblas_get_num_threads(PyObject *self, PyObject *args);
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args);
 
+int
+quadblas_set_num_threads(int num_threads);
+int
+quadblas_get_num_threads(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index f31ec89f..c9fec506 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -289,10 +289,21 @@ init_matmul_ops(PyObject *numpy)
 
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
+    #ifndef DISABLE_QUADBLAS
+    // set threading to max
+    int num_threads = quadblas_get_num_threads();
+    quadblas_set_num_threads(num_threads);
+    
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
+    #else
+    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {0, NULL}};
+    #endif // DISABLE_QUADBLAS
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul_qblas",

From 5e5fa659fcce5e3daa3a1d7cd771244290ea8531 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 18:26:27 +0000
Subject: [PATCH 17/20] switching to internal function use only

---
 quaddtype/numpy_quaddtype/src/quadblas_interface.cpp | 8 ++++----
 quaddtype/numpy_quaddtype/src/quadblas_interface.h   | 4 ++--
 quaddtype/numpy_quaddtype/src/umath/matmul.cpp       | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6eb57574..65feb604 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -145,14 +145,14 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
 }
 
 int
-quadblas_set_num_threads(int num_threads)
+_quadblas_set_num_threads(int num_threads)
 {
     QuadBLAS::set_num_threads(num_threads);
     return 0;
 }
 
 int
-quadblas_get_num_threads(void)
+_quadblas_get_num_threads(void)
 {
     int num_threads = QuadBLAS::get_num_threads();
     return num_threads;
@@ -213,7 +213,7 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
 }
 
 int
-quadblas_set_num_threads(int num_threads)
+_quadblas_set_num_threads(int num_threads)
 {
     // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
@@ -221,7 +221,7 @@ quadblas_set_num_threads(int num_threads)
 }
 
 int
-quadblas_get_num_threads(void)
+_quadblas_get_num_threads(void)
 {
     // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index 82a685a9..76033ebc 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -33,9 +33,9 @@ PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args);
 
 int
-quadblas_set_num_threads(int num_threads);
+_quadblas_set_num_threads(int num_threads);
 int
-quadblas_get_num_threads(void);
+_quadblas_get_num_threads(void);
 
 #ifdef __cplusplus
 }
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index c9fec506..98053b82 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -291,9 +291,9 @@ init_matmul_ops(PyObject *numpy)
 
     #ifndef DISABLE_QUADBLAS
     // set threading to max
-    int num_threads = quadblas_get_num_threads();
-    quadblas_set_num_threads(num_threads);
-    
+    int num_threads = _quadblas_get_num_threads();
+    _quadblas_set_num_threads(num_threads);
+
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},

From cec5ace3a011cb38c57bf0cd33b99b5df0d36cc6 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 03:46:52 +0530
Subject: [PATCH 18/20] this should fix them all

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../numpy_quaddtype/src/umath/matmul.cpp      | 216 +++++++++++++-----
 2 files changed, 160 insertions(+), 58 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 0eabb677..9468e24a 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
+Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 98053b82..354a342d 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -93,8 +93,9 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
 }
 
 static int
-quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
-                         npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
+quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                 npy_intp const dimensions[], npy_intp const strides[],
+                                 NpyAuxData *auxdata)
 {
     // Extract dimensions
     npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
@@ -149,6 +150,8 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
             size_t incx = B_row_stride / sizeof(Sleef_quad);
             size_t incy = C_row_stride / sizeof(Sleef_quad);
 
+            memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+
             result =
                     qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy);
             break;
@@ -159,32 +162,132 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
             size_t ldb = B_row_stride / sizeof(Sleef_quad);
             size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
 
+            memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+
+            size_t ldc_temp = p;
+
+            result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
+                                C_ptr, ldc_numpy);
+            break;
+        }
+    }
+
+    if (result != 0) {
+        PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int
+quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                   npy_intp const dimensions[], npy_intp const strides[],
+                                   NpyAuxData *auxdata)
+{
+    // Extract dimensions
+    npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
+    npy_intp m = dimensions[1];  // Rows of first matrix
+    npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
+    npy_intp p = dimensions[3];  // Cols of second matrix
+
+    // batch strides
+    npy_intp A_stride = strides[0];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
+
+    // core strides for matrix dimensions
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    if (descr->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        return -1;
+    }
+
+    MatmulOperationType op_type = determine_operation_type(m, n, p);
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
+
+    Sleef_quad *A_ptr = (Sleef_quad *)A;
+    Sleef_quad *B_ptr = (Sleef_quad *)B;
+    Sleef_quad *C_ptr = (Sleef_quad *)C;
+
+    int result = -1;
+
+    switch (op_type) {
+        case MATMUL_DOT: {
+            Sleef_quad *temp_A_buffer = new Sleef_quad[n];
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n];
+
+            memcpy(temp_A_buffer, A_ptr, n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * sizeof(Sleef_quad));
+
+            size_t incx = 1;
+            size_t incy = 1;
+
+            result = qblas_dot(n, temp_A_buffer, incx, temp_B_buffer, incy, C_ptr);
+
+            delete[] temp_A_buffer;
+            delete[] temp_B_buffer;
+            break;
+        }
+
+        case MATMUL_GEMV: {
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t incx = B_row_stride / sizeof(Sleef_quad);
+            size_t incy = C_row_stride / sizeof(Sleef_quad);
+
             Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
-            if (!temp_A_buffer) {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
-                delete[] temp_A_buffer;
-                return -1;
-            }
             Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
-            if (!temp_B_buffer) {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
-                delete[] temp_A_buffer;
-                return -1;
-            }
             memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
             memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
             A_ptr = temp_A_buffer;
             B_ptr = temp_B_buffer;
 
+            // Use temp_C_buffer to avoid unaligned writes
             Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
-            if (!temp_C_buffer) {
-                PyErr_SetString(PyExc_MemoryError,
-                                "Failed to allocate temporary buffer for GEMM result");
-                return -1;
-            }
 
+            lda = n;
+            incx = 1;
+            incy = 1;
+
+            memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
+
+            result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta,
+                                temp_C_buffer, incy);
+            break;
+        }
+
+        case MATMUL_GEMM: {
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t ldb = B_row_stride / sizeof(Sleef_quad);
+            size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+
+            Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
+            memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
+            A_ptr = temp_A_buffer;
+            B_ptr = temp_B_buffer;
+
+            // since these are now contiguous so,
+            lda = n;
+            ldb = p;
             size_t ldc_temp = p;
 
+            Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
+            memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
+
             result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
                                 temp_C_buffer, ldc_temp);
 
@@ -218,8 +321,8 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp p = dimensions[3];
 
     npy_intp A_batch_stride = strides[0];
-    npy_intp B_batch_stride = strides[1];
-    npy_intp C_batch_stride = strides[2];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
 
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
@@ -232,46 +335,44 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     QuadBackendType backend = descr->backend;
     size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
 
-    for (npy_intp batch = 0; batch < N; batch++) {
-        char *A_batch = data[0] + batch * A_batch_stride;
-        char *B_batch = data[1] + batch * B_batch_stride;
-        char *C_batch = data[2] + batch * C_batch_stride;
-
-        for (npy_intp i = 0; i < m; i++) {
-            for (npy_intp j = 0; j < p; j++) {
-                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
 
-                if (backend == BACKEND_SLEEF) {
-                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+    for (npy_intp i = 0; i < m; i++) {
+        for (npy_intp j = 0; j < p; j++) {
+            char *C_ij = C + i * C_row_stride + j * C_col_stride;
 
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+            if (backend == BACKEND_SLEEF) {
+                Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
 
-                        Sleef_quad a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
-                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
-                        sum = Sleef_fmaq1_u05(a_val, b_val, sum);
-                    }
+                for (npy_intp k = 0; k < n; k++) {
+                    char *A_ik = A + i * A_row_stride + k * A_col_stride;
+                    char *B_kj = B + k * B_row_stride + j * B_col_stride;
 
-                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                    Sleef_quad a_val, b_val;
+                    memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                    memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+                    sum = Sleef_fmaq1_u05(a_val, b_val, sum);
                 }
-                else {
-                    long double sum = 0.0L;
 
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+                memcpy(C_ij, &sum, sizeof(Sleef_quad));
+            }
+            else {
+                long double sum = 0.0L;
 
-                        long double a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(long double));
-                        memcpy(&b_val, B_kj, sizeof(long double));
+                for (npy_intp k = 0; k < n; k++) {
+                    char *A_ik = A + i * A_row_stride + k * A_col_stride;
+                    char *B_kj = B + k * B_row_stride + j * B_col_stride;
 
-                        sum += a_val * b_val;
-                    }
+                    long double a_val, b_val;
+                    memcpy(&a_val, A_ik, sizeof(long double));
+                    memcpy(&b_val, B_kj, sizeof(long double));
 
-                    memcpy(C_ij, &sum, sizeof(long double));
+                    sum += a_val * b_val;
                 }
+
+                memcpy(C_ij, &sum, sizeof(long double));
             }
         }
     }
@@ -289,21 +390,22 @@ init_matmul_ops(PyObject *numpy)
 
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
-    #ifndef DISABLE_QUADBLAS
+#ifndef DISABLE_QUADBLAS
     // set threading to max
     int num_threads = _quadblas_get_num_threads();
     _quadblas_set_num_threads(num_threads);
 
-    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
-                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
-                           {0, NULL}};
-    #else
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop_aligned},
+            {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop_unaligned},
+            {0, NULL}};
+#else
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
-    #endif // DISABLE_QUADBLAS
+#endif  // DISABLE_QUADBLAS
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul_qblas",
@@ -335,7 +437,7 @@ init_matmul_ops(PyObject *numpy)
     }
 
     if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        PyErr_Clear();  // Don't fail if promoter fails
+        PyErr_Clear();
     }
     else {
     }

From 1fe6c8172582ec2d24d34640afef72f53e2c078d Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 04:14:10 +0530
Subject: [PATCH 19/20] wrapping up

---
 quaddtype/README.md                          |  8 +++++--
 quaddtype/numpy_quaddtype/src/umath/matmul.h | 23 --------------------
 quaddtype/release_tracker.md                 |  2 --
 3 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/quaddtype/README.md b/quaddtype/README.md
index af4ddef7..11eef1d2 100644
--- a/quaddtype/README.md
+++ b/quaddtype/README.md
@@ -27,7 +27,7 @@ np.array([1,2,3], dtype=QuadPrecDType("longdouble"))
 
 The code needs the quad precision pieces of the sleef library, which
 is not available on most systems by default, so we have to generate
-that first.  The below assumes one has the required pieces to build
+that first. The below assumes one has the required pieces to build
 sleef (cmake and libmpfr-dev), and that one is in the package
 directory locally.
 
@@ -40,6 +40,7 @@ cd ..
 ```
 
 Building the `numpy-quaddtype` package from locally installed sleef:
+
 ```bash
 export SLEEF_DIR=$PWD/sleef/build
 export LIBRARY_PATH=$SLEEF_DIR/lib
@@ -57,10 +58,13 @@ export LDFLAGS="-Wl,-rpath,$SLEEF_DIR/lib -fopenmp -latomic -lpthread"
 export CFLAGS="-fPIC"
 export CXXFLAGS="-fPIC"
 
+# To build without QBLAS (default for MSVC)
+# export CFLAGS="-fPIC -DDISABLE_QUADBLAS"
+# export CXXFLAGS="-fPIC -DDISABLE_QUADBLAS"
+
 python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
 
 # Run the tests
 cd ..
 python -m pytest
 ```
-
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
index 947e2c3d..12858497 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.h
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -1,35 +1,12 @@
 #ifndef _QUADDTYPE_MATMUL_H
 #define _QUADDTYPE_MATMUL_H
 
-/**
- * Quad Precision Matrix Multiplication for NumPy
- *
- * This module implements matrix multiplication functionality for the QuadPrecDType
- * by registering custom loops with numpy's matmul generalized ufunc.
- *
- * Supports all matmul operation types:
- * - Vector-vector (dot product): (n,) @ (n,) -> scalar
- * - Matrix-vector: (m,n) @ (n,) -> (m,)
- * - Vector-matrix: (n,) @ (n,p) -> (p,)
- * - Matrix-matrix: (m,n) @ (n,p) -> (m,p)
- *
- * Uses naive algorithms optimized for correctness rather than performance.
- * For production use, consider integration with QBLAS optimized routines.
- */
-
 #include <Python.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**
- * Initialize the matmul operations for the quad precision dtype.
- * This function registers the matmul generalized ufunc with numpy.
- *
- * @param numpy The numpy module object
- * @return 0 on success, -1 on failure
- */
 int
 init_matmul_ops(PyObject *numpy);
 
diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index 3ed10042..fbd3b205 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -91,5 +91,3 @@
 | floor         | ✅    |
 | ceil          | ✅    |
 | trunc         | ✅    |
-
-- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback

From 8f16b9904b9360e6daa902cfd400816ce8a759ff Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 04:27:01 +0530
Subject: [PATCH 20/20] updated branch to main

---
 .github/workflows/build_wheels.yml | 2 +-
 .github/workflows/ci.yml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 0bf55c46..37a80387 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - dot
+      - main
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e42c12c8..542093c5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - dot
+      - main
   pull_request:
   workflow_dispatch: