From bc333df1fe32d3dda149ba1c2aa34653d62e68d7 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 13:01:30 +0000
Subject: [PATCH 01/49] interfacing with qblas

---
 .gitmodules                                   |   3 +
 quaddtype/meson.build                         |  14 +-
 quaddtype/numpy_quaddtype/QBLAS               |   1 +
 quaddtype/numpy_quaddtype/__init__.py         |  13 +-
 .../src/quadblas_interface.cpp                | 427 ++++++++++++++++++
 .../numpy_quaddtype/src/quadblas_interface.h  |  27 ++
 .../numpy_quaddtype/src/quaddtype_main.c      |   5 +
 7 files changed, 485 insertions(+), 5 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 quaddtype/numpy_quaddtype/QBLAS
 create mode 100644 quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/quadblas_interface.h

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..523c79c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "quaddtype/numpy_quaddtype/QBLAS"]
+	path = quaddtype/numpy_quaddtype/QBLAS
+	url = https://github.com/SwayamInSync/QBLAS
diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index e7e8dd9..d1c6799 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -23,9 +23,17 @@ incdir_numpy = run_command(py,
   check : true
 ).stdout().strip()
 
+# Add OpenMP dependency (optional, for threading)
+openmp_dep = dependency('openmp', required: false)
+dependencies = [sleef_dep, py_dep]
+if openmp_dep.found()
+    dependencies += openmp_dep
+endif
+
 includes = include_directories(
     [
         incdir_numpy,
+        'numpy_quaddtype/QBLAS/include',
         'numpy_quaddtype/src',
     ]
 )
@@ -45,7 +53,9 @@ srcs = [
     'numpy_quaddtype/src/umath.h',
     'numpy_quaddtype/src/umath.cpp',
     'numpy_quaddtype/src/dragon4.h',
-    'numpy_quaddtype/src/dragon4.c'
+    'numpy_quaddtype/src/dragon4.c',
+    'numpy_quaddtype/src/quadblas_interface.h',
+    'numpy_quaddtype/src/quadblas_interface.cpp'
 ]
 
 py.install_sources(
@@ -60,7 +70,7 @@ py.extension_module('_quaddtype_main',
   srcs,
   link_args: is_windows ? ['/DEFAULTLIB:sleef', '/DEFAULTLIB:sleefquad'] : ['-lsleef', '-lsleefquad'],
   link_language: 'cpp',
-  dependencies: [sleef_dep, py_dep],
+  dependencies: dependencies,
   install: true,
   subdir: 'numpy_quaddtype',
   include_directories: includes
diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
new file mode 160000
index 0000000..6858d56
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -0,0 +1 @@
+Subproject commit 6858d565216db78901d8cf72227fe553deae088f
diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index e469a4c..9033b71 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -2,13 +2,20 @@
     QuadPrecision,
     QuadPrecDType,
     is_longdouble_128,
-    get_sleef_constant
+    get_sleef_constant,
+    dot,
+    set_num_threads,
+    get_num_threads,
+    get_quadblas_version
 )
 
 __all__ = [
     'QuadPrecision', 'QuadPrecDType', 'SleefQuadPrecision', 'LongDoubleQuadPrecision',
-    'SleefQuadPrecDType', 'LongDoubleQuadPrecDType', 'is_longdouble_128', 'pi', 'e', 
-    'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon'
+    'SleefQuadPrecDType', 'LongDoubleQuadPrecDType', 'is_longdouble_128', 
+    # Constants
+    'pi', 'e', 'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon', 
+    # QuadBLAS related functions
+    'dot', 'set_num_threads', 'get_num_threads', 'get_quadblas_version'
 ]
 
 def SleefQuadPrecision(value):
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
new file mode 100644
index 0000000..c08320b
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -0,0 +1,427 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+extern "C" {
+#include <Python.h>
+#include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/dtype_api.h"
+}
+
+#include "scalar.h"
+#include "dtype.h"
+#include "quad_common.h"
+#include "quadblas_interface.h"
+
+// Include QuadBLAS header
+#include "../QBLAS/include/quadblas/quadblas.hpp"
+
+// Map a NumPy array's memory order onto the matching QuadBLAS layout tag
+static QuadBLAS::Layout get_quadblas_layout(PyArrayObject *arr) {
+    // Every array that is not C-contiguous is reported as column-major;
+    // no separate Fortran-contiguity check is made.
+    const bool c_ordered = PyArray_IS_C_CONTIGUOUS(arr);
+    return c_ordered ? QuadBLAS::Layout::RowMajor
+                     : QuadBLAS::Layout::ColMajor;
+}
+
+// Validate that arr is a QuadPrecDType ndarray and report its data pointer, backend and layout; false + TypeError otherwise
+static bool extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, 
+                                   QuadBackendType *backend, QuadBLAS::Layout *layout) {
+    if (!PyArray_Check(arr)) {  // reject anything that is not an ndarray
+        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
+        return false;
+    }
+    
+    PyArray_Descr *descr = PyArray_DESCR(arr);
+    if (!PyObject_TypeCheck(descr, (PyTypeObject*)&QuadPrecDType)) {  // descriptor must be a QuadPrecDType instance
+        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
+        return false;
+    }
+    
+    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject*)descr;
+    *backend = quad_descr->backend;  // backend is taken from the descriptor
+    *data = (Sleef_quad*)PyArray_DATA(arr);  // raw buffer reinterpreted as Sleef_quad* whatever the backend is
+    *layout = get_quadblas_layout(arr);  // RowMajor if C-contiguous, ColMajor otherwise
+    
+    return true;
+}
+
+// Return Sleef_quad data for arr: the array's own buffer for SLEEF, else a converted copy stored in *temp_storage
+static Sleef_quad* ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend, 
+                                       Sleef_quad **temp_storage) {
+    if (original_backend == BACKEND_SLEEF) {
+        *temp_storage = nullptr;  // nothing allocated on this path
+        return (Sleef_quad*)PyArray_DATA(arr);  // pointer into the array's own buffer
+    }
+    
+    // Any other backend is read as long double and converted into a dense temporary of PyArray_SIZE elements
+    npy_intp size = PyArray_SIZE(arr);
+    *temp_storage = QuadBLAS::aligned_alloc<Sleef_quad>(size);
+    if (!*temp_storage) {
+        PyErr_NoMemory();
+        return nullptr;
+    }
+    
+    long double *ld_data = (long double*)PyArray_DATA(arr);
+    for (npy_intp i = 0; i < size; i++) {  // linear walk over the raw buffer; array strides are not consulted
+        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);  // goes through double, so extra long double precision is dropped
+    }
+    
+    return *temp_storage;
+}
+
+// Vector-vector dot product: returns a new QuadPrecision scalar, or nullptr with an exception set
+static PyObject* dot_vector_vector(PyArrayObject *a, PyArrayObject *b) {
+    // Both operands must be 1-D and of equal length
+    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
+        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
+        return nullptr;
+    }
+    
+    npy_intp n_a = PyArray_DIM(a, 0);
+    npy_intp n_b = PyArray_DIM(b, 0);
+    
+    if (n_a != n_b) {
+        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
+        return nullptr;
+    }
+    
+    // Validate dtypes and read each operand's backend (the data/layout outputs are not used here)
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+    QuadBLAS::Layout layout_a, layout_b;
+    
+    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+        return nullptr;
+    }
+    
+    // QuadBLAS::dot takes Sleef_quad data; non-SLEEF operands are converted into temporaries
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+    
+    if (!sleef_a || !sleef_b) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+    
+    // Element (not byte) strides. A converted temporary is densely packed, so its stride is 1
+    npy_intp stride_a = temp_a ? 1 : PyArray_STRIDE(a, 0) / PyArray_ITEMSIZE(a);
+    npy_intp stride_b = temp_b ? 1 : PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
+    
+    // Perform dot product using QuadBLAS
+    Sleef_quad result = QuadBLAS::dot(n_a, sleef_a, stride_a, sleef_b, stride_b);
+    
+    // Temporaries are no longer needed once the scalar result is in hand
+    QuadBLAS::aligned_free(temp_a);
+    QuadBLAS::aligned_free(temp_b);
+    
+    // Result backend: long double only when both operands are long double, SLEEF otherwise
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+    
+    // Create result scalar
+    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
+    if (!result_obj) {
+        return nullptr;
+    }
+    
+    if (result_backend == BACKEND_SLEEF) {
+        result_obj->value.sleef_value = result;
+    } else {
+        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);  // narrowed through double
+    }
+    
+    return (PyObject*)result_obj;
+}
+
+// Matrix-vector product y = A * x: returns a new 1-D QuadPrecDType array of length m, or nullptr with an exception set
+static PyObject* dot_matrix_vector(PyArrayObject *a, PyArrayObject *b) {
+    // A must be 2-D (m x n) and x must be 1-D of length n
+    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
+        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
+        return nullptr;
+    }
+    
+    npy_intp m = PyArray_DIM(a, 0);
+    npy_intp n = PyArray_DIM(a, 1);
+    npy_intp n_b = PyArray_DIM(b, 0);
+    
+    if (n != n_b) {
+        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
+        return nullptr;
+    }
+    
+    // Validate dtypes and read backend/layout of each operand
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+    QuadBLAS::Layout layout_a, layout_b;
+    
+    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+        return nullptr;
+    }
+    
+    // QuadBLAS::gemv takes Sleef_quad data; non-SLEEF operands are converted into temporaries
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+    
+    if (!sleef_a || !sleef_b) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+    
+    // Result backend: long double only when both operands are long double, SLEEF otherwise
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+    
+    // Create result array (1D with length m)
+    npy_intp result_dims[1] = {m};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+    
+    PyArrayObject *result = (PyArrayObject*)PyArray_Empty(1, result_dims, 
+                                                         (PyArray_Descr*)result_dtype, 0);
+    if (!result) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        // No Py_DECREF(result_dtype) here: PyArray_Empty stole that reference, even on failure
+        return nullptr;
+    }
+    
+    Sleef_quad *result_data = (Sleef_quad*)PyArray_DATA(result);
+    
+    // Leading dimension of A, in elements, for the layout reported by get_quadblas_layout
+    npy_intp lda;
+    if (layout_a == QuadBLAS::Layout::RowMajor) {
+        lda = n;  // For row-major, leading dimension is number of columns
+    } else {
+        lda = m;  // For column-major, leading dimension is number of rows
+    }
+    
+    npy_intp stride_b = temp_b ? 1 : PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);  // a converted temporary is dense
+    npy_intp stride_result = PyArray_STRIDE(result, 0) / PyArray_ITEMSIZE(result);
+    
+    // Perform matrix-vector multiplication using QuadBLAS
+    // y = 1.0 * A * x + 0.0 * y
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+    
+    QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, 
+                   sleef_b, stride_b, beta, result_data, stride_result);
+    
+    // Long double result: rewrite each element in place (the value is narrowed through double)
+    if (result_backend == BACKEND_LONGDOUBLE) {
+        long double *ld_result = (long double*)PyArray_DATA(result);
+        for (npy_intp i = 0; i < m; i++) {
+            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        }
+    }
+    
+    // Clean up temporary storage
+    QuadBLAS::aligned_free(temp_a);
+    QuadBLAS::aligned_free(temp_b);
+    
+    return (PyObject*)result;
+}
+
+// Matrix-matrix product C = A * B: returns a new m x n QuadPrecDType array, or nullptr with an exception set
+static PyObject* dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b) {
+    // A must be m x k and B must be k x n
+    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
+        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
+        return nullptr;
+    }
+    
+    npy_intp m = PyArray_DIM(a, 0);
+    npy_intp k = PyArray_DIM(a, 1);
+    npy_intp k_b = PyArray_DIM(b, 0);
+    npy_intp n = PyArray_DIM(b, 1);
+    
+    if (k != k_b) {
+        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
+        return nullptr;
+    }
+    
+    // Validate dtypes and read backend/layout of each operand
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+    QuadBLAS::Layout layout_a, layout_b;
+    
+    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+        return nullptr;
+    }
+    
+    // QuadBLAS::gemm takes Sleef_quad data; non-SLEEF operands are converted into temporaries
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+    
+    if (!sleef_a || !sleef_b) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+    
+    // Result backend: long double only when both operands are long double, SLEEF otherwise
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+    
+    // Create result array (2D with shape m x n)
+    npy_intp result_dims[2] = {m, n};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+    // Allocate C in the memory order gemm writes it in (A's layout) so element indices line up
+    PyArrayObject *result = (PyArrayObject*)PyArray_Empty(2, result_dims, 
+                                (PyArray_Descr*)result_dtype, layout_a == QuadBLAS::Layout::ColMajor ? 1 : 0);
+    if (!result) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        // No Py_DECREF(result_dtype) here: PyArray_Empty stole that reference, even on failure
+        return nullptr;
+    }
+    
+    Sleef_quad *result_data = (Sleef_quad*)PyArray_DATA(result);
+    
+    // Leading dimensions, in elements, for the layouts reported by get_quadblas_layout
+    npy_intp lda, ldb, ldc;
+    
+    if (layout_a == QuadBLAS::Layout::RowMajor) {
+        lda = k;  // For row-major A: leading dimension is number of columns
+    } else {
+        lda = m;  // For column-major A: leading dimension is number of rows
+    }
+    
+    if (layout_b == QuadBLAS::Layout::RowMajor) {
+        ldb = n;  // For row-major B: leading dimension is number of columns
+    } else {
+        ldb = k;  // For column-major B: leading dimension is number of rows
+    }
+    
+    // One layout is passed to gemm for A, B and C; A's is used. NOTE(review): B is assumed to share it - confirm
+    QuadBLAS::Layout result_layout = layout_a;
+    if (result_layout == QuadBLAS::Layout::RowMajor) {
+        ldc = n;  // For row-major C: leading dimension is number of columns
+    } else {
+        ldc = m;  // For column-major C: leading dimension is number of rows
+    }
+    
+    // Perform matrix-matrix multiplication using QuadBLAS
+    // C = 1.0 * A * B + 0.0 * C
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+    
+    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, 
+                   sleef_b, ldb, beta, result_data, ldc);
+    
+    // Long double result: rewrite each element in place (the value is narrowed through double)
+    if (result_backend == BACKEND_LONGDOUBLE) {
+        long double *ld_result = (long double*)PyArray_DATA(result);
+        for (npy_intp i = 0; i < m * n; i++) {
+            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        }
+    }
+    
+    // Clean up temporary storage
+    QuadBLAS::aligned_free(temp_a);
+    QuadBLAS::aligned_free(temp_b);
+    
+    return (PyObject*)result;
+}
+
+// Python entry point for dot(a, b): converts both operands to arrays and dispatches on their dimensionality
+PyObject* py_quadblas_dot(PyObject* self, PyObject* args) {
+    PyObject *a_obj, *b_obj;
+    
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return nullptr;
+    }
+    
+    // The kernels below assume dense storage, so request C-contiguous, aligned inputs (NumPy copies if needed)
+    PyArrayObject *a = (PyArrayObject*)PyArray_FROM_OF(a_obj, NPY_ARRAY_IN_ARRAY);
+    PyArrayObject *b = a ? (PyArrayObject*)PyArray_FROM_OF(b_obj, NPY_ARRAY_IN_ARRAY) : nullptr;  // skip if a already failed
+    
+    if (!a || !b) {
+        Py_XDECREF(a);
+        Py_XDECREF(b);
+        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
+        return nullptr;
+    }
+    
+    PyObject *result = nullptr;  // stays null on every error path; an exception is set in each of them
+    
+    // Dispatch based on dimensions
+    int ndim_a = PyArray_NDIM(a);
+    int ndim_b = PyArray_NDIM(b);
+    
+    if (ndim_a == 1 && ndim_b == 1) {
+        // Vector-Vector dot product
+        result = dot_vector_vector(a, b);
+    } else if (ndim_a == 2 && ndim_b == 1) {
+        // Matrix-Vector multiplication
+        result = dot_matrix_vector(a, b);
+    } else if (ndim_a == 2 && ndim_b == 2) {
+        // Matrix-Matrix multiplication
+        result = dot_matrix_matrix(a, b);
+    } else if (ndim_a == 1 && ndim_b == 2) {
+        PyErr_SetString(PyExc_ValueError, 
+            "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
+    } else {
+        PyErr_SetString(PyExc_ValueError, 
+            "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
+    }
+    
+    Py_DECREF(a);
+    Py_DECREF(b);
+    
+    return result;
+}
+
+// Python binding set_num_threads(n): validates n and forwards it to QuadBLAS
+PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args) {
+    int requested = 0;
+    
+    if (PyArg_ParseTuple(args, "i", &requested) == 0) {
+        return nullptr;
+    }
+    
+    if (requested <= 0) {
+        PyErr_SetString(PyExc_ValueError, "Number of threads must be positive");
+        return nullptr;
+    }
+    
+    QuadBLAS::set_num_threads(requested);
+    Py_RETURN_NONE;
+}
+
+PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args) {  // args is never read
+    return PyLong_FromLong(QuadBLAS::get_num_threads());  // new Python int holding the value QuadBLAS reports
+}
+
+PyObject* py_quadblas_get_version(PyObject* self, PyObject* args) {  // args is never read
+    return PyUnicode_FromString(QuadBLAS::VERSION);  // new Python str built from the QuadBLAS::VERSION constant
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
new file mode 100644
index 0000000..1bdfdfb
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -0,0 +1,27 @@
+#ifndef _QUADDTYPE_QUADBLAS_INTERFACE_H
+#define _QUADDTYPE_QUADBLAS_INTERFACE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <Python.h>
+
+// Main dot function that handles vector-vector, matrix-vector, and matrix-matrix operations
+// Renamed to avoid conflicts with QuadBLAS C interface
+PyObject* py_quadblas_dot(PyObject* self, PyObject* args);
+
+// Threading control functions
+// Renamed to avoid conflicts with QuadBLAS C interface
+PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args);
+PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args);
+
+// Version information
+// Renamed to avoid conflicts with QuadBLAS C interface
+PyObject* py_quadblas_get_version(PyObject* self, PyObject* args);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _QUADDTYPE_QUADBLAS_INTERFACE_H
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index f293529..4a9af5d 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -16,6 +16,7 @@
 #include "dtype.h"
 #include "umath.h"
 #include "quad_common.h"
+#include "quadblas_interface.h"
 #include "float.h"
 
 
@@ -71,6 +72,10 @@ static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
 static PyMethodDef module_methods[] = {
     {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
     {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
+    {"dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"},
+    {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, "Set number of threads for QuadBLAS"},
+    {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, "Get number of threads for QuadBLAS"},
+    {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},
     {NULL, NULL, 0, NULL} 
 };
 

From babaa96c6365820383d69a744adc5f48412b413b Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 17:21:58 +0000
Subject: [PATCH 02/49] adding test cases

---
 quaddtype/numpy_quaddtype/QBLAS                |  2 +-
 quaddtype/numpy_quaddtype/__init__.py          | 10 ++++++++--
 quaddtype/numpy_quaddtype/src/quaddtype_main.c |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 6858d56..3d74bc5 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 6858d565216db78901d8cf72227fe553deae088f
+Subproject commit 3d74bc5038d7b44f50b1c1cbd292da9e1b853b56
diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index 9033b71..b0a9f3b 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -3,12 +3,14 @@
     QuadPrecDType,
     is_longdouble_128,
     get_sleef_constant,
-    dot,
+    qblas_dot as dot,
     set_num_threads,
     get_num_threads,
     get_quadblas_version
 )
 
+import multiprocessing
+
 __all__ = [
     'QuadPrecision', 'QuadPrecDType', 'SleefQuadPrecision', 'LongDoubleQuadPrecision',
     'SleefQuadPrecDType', 'LongDoubleQuadPrecDType', 'is_longdouble_128', 
@@ -38,4 +40,8 @@ def LongDoubleQuadPrecDType():
 ln10 = get_sleef_constant("ln10")
 max_value = get_sleef_constant("quad_max")
 min_value = get_sleef_constant("quad_min")
-epsilon = get_sleef_constant("epsilon")
\ No newline at end of file
+epsilon = get_sleef_constant("epsilon")
+
+num_cores = multiprocessing.cpu_count()
+# set default number of threads for QuadBLAS
+set_num_threads(num_cores)
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index 4a9af5d..9e2b843 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -72,7 +72,7 @@ static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
 static PyMethodDef module_methods[] = {
     {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
     {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
-    {"dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"},
+    {"qblas_dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"},
     {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, "Set number of threads for QuadBLAS"},
     {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, "Get number of threads for QuadBLAS"},
     {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},

From c3aaa052b173b4113a65d2ce669ed98761e502ae Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 19:16:04 +0000
Subject: [PATCH 03/49] test-1: ci

---
 .github/workflows/build_wheels.yml            | 171 +++++---
 quaddtype/README.md                           |   5 +-
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../src/quadblas_interface.cpp                | 320 +++++++--------
 .../numpy_quaddtype/src/quadblas_interface.h  |  10 +-
 quaddtype/tests/test_dot.py                   | 377 ++++++++++++++++++
 6 files changed, 659 insertions(+), 226 deletions(-)
 create mode 100644 quaddtype/tests/test_dot.py

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 2d1d4d5..d8114a1 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - main
+      - dot
     tags:
       - "quaddtype-v*"
     paths:
@@ -19,12 +19,44 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: ">=3.10.0"
 
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cmake build-essential libomp-dev git pkg-config
+
+      - name: Verify QuadBLAS submodule
+        run: |
+          ls -la QuadBLAS/
+          ls -la QuadBLAS/include/quadblas/
+
+      - name: Install SLEEF and setup QuadBLAS
+        run: |
+          # Install SLEEF
+          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+          cd sleef
+          cmake -S . -B build \
+            -DSLEEF_BUILD_QUAD:BOOL=ON \
+            -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+          cmake --build build/ --clean-first -j
+          sudo cmake --install build --prefix /usr/local
+          cd ..
+          
+          # Install QuadBLAS headers from submodule (header-only)
+          sudo cp -r QuadBLAS/include/quadblas /usr/local/include/
+          
+          # Verify installation
+          ls -la /usr/local/include/quadblas/
+          ls -la /usr/local/lib/
+
       - name: Install cibuildwheel
         run: pip install cibuildwheel==2.20.0
 
@@ -34,16 +66,25 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
-            git clone --branch 3.8 https://github.com/shibatch/sleef.git
+            # Install SLEEF
+            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
             cd sleef
-            cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            cmake -S . -B build \
+              -DSLEEF_BUILD_QUAD:BOOL=ON \
+              -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+              -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             cmake --build build/ --clean-first -j
             cmake --install build --prefix /usr/local
+            cd ..
+            
+            # Install QuadBLAS headers from submodule
+            cp -r QuadBLAS/include/quadblas /usr/local/include/
           CIBW_ENVIRONMENT: >
-            CFLAGS="-I/usr/local/include $CFLAGS"
-            CXXFLAGS="-I/usr/local/include $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib64 $LDFLAGS"
-            LD_LIBRARY_PATH="/usr/local/lib64:$LD_LIBRARY_PATH"
+            CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib64 -Wl,-rpath,/usr/local/lib64 -lsleef -lsleefquad -fopenmp -latomic -lpthread $LDFLAGS"
+            LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+            PKG_CONFIG_PATH="/usr/local/lib64/pkgconfig:/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
           CIBW_REPAIR_WHEEL_COMMAND: |
             auditwheel repair -w {dest_dir} --plat manylinux_2_28_x86_64 {wheel}
           CIBW_TEST_COMMAND: |
@@ -68,17 +109,29 @@ jobs:
 
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: "3.10"
 
-      - name: Install SLEEF
+      - name: Install dependencies
+        run: |
+          brew install cmake libomp git
+
+      - name: Verify QuadBLAS submodule
+        run: |
+          ls -la QuadBLAS/
+          ls -la QuadBLAS/include/quadblas/
+
+      - name: Install SLEEF and setup QuadBLAS
         env:
           MACOSX_DEPLOYMENT_TARGET: "11.0"
         run: |
-          git clone --branch 3.8 https://github.com/shibatch/sleef.git
+          # Install SLEEF
+          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build \
             -DSLEEF_BUILD_QUAD:BOOL=ON \
@@ -89,6 +142,11 @@ jobs:
             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
           cmake --build build/ --clean-first -j
           sudo cmake --install build --prefix /usr/local
+          cd ..
+          
+          # Install QuadBLAS headers from submodule
+          sudo cp -r QuadBLAS/include/quadblas /usr/local/include/
+
       - name: Install cibuildwheel
         run: pip install cibuildwheel==2.20.0
 
@@ -97,12 +155,30 @@ jobs:
           CIBW_BUILD: "cp310-* cp311-* cp312-*"
           CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
           CIBW_BUILD_VERBOSITY: "1"
+          CIBW_BEFORE_ALL: |
+            # Install SLEEF
+            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+            cd sleef
+            cmake -S . -B build \
+              -DSLEEF_BUILD_QUAD:BOOL=ON \
+              -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+              -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+              -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
+              -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
+              -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+            cmake --build build/ --clean-first -j
+            cmake --install build --prefix /usr/local
+            cd ..
+            
+            # Install QuadBLAS headers from submodule
+            cp -r QuadBLAS/include/quadblas /usr/local/include/
           CIBW_ENVIRONMENT: >
             MACOSX_DEPLOYMENT_TARGET="11.0"
             DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
-            CFLAGS="-I/usr/local/include $CFLAGS"
-            CXXFLAGS="-I/usr/local/include $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib $LDFLAGS"
+            CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib -lsleef -lsleefquad $LDFLAGS"
+            PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
           CIBW_REPAIR_WHEEL_COMMAND: >
             delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}
           CIBW_TEST_COMMAND: |
@@ -127,6 +203,8 @@ jobs:
 
     steps:
       - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
       - name: Setup MSVC
         uses: ilammy/msvc-dev-cmd@v1
@@ -142,19 +220,31 @@ jobs:
       - name: Install CMake
         uses: lukka/get-cmake@latest
 
+      - name: Verify QuadBLAS submodule
+        shell: pwsh
+        run: |
+          Get-ChildItem QuadBLAS/
+          Get-ChildItem QuadBLAS/include/quadblas/
+
       - name: Clone and Build SLEEF
         shell: pwsh
         run: |
-          git clone --branch 3.8 https://github.com/shibatch/sleef.git
+          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
           cmake --build build --config Release
           cmake --install build --prefix "C:/sleef" --config Release
 
+      - name: Install QuadBLAS headers from submodule
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Force -Path "C:/quadblas/include"
+          Copy-Item -Recurse -Force "QuadBLAS/include/quadblas" "C:/quadblas/include/"
+
       - name: Setup build environment
         shell: pwsh
         run: |
-          $env:INCLUDE += ";C:\sleef\include"
+          $env:INCLUDE += ";C:\sleef\include;C:\quadblas\include"
           $env:LIB += ";C:\sleef\lib"
           $env:PATH = "C:\sleef\bin;$env:PATH"
           echo "INCLUDE=$env:INCLUDE" >> $env:GITHUB_ENV
@@ -175,8 +265,24 @@ jobs:
           CIBW_BUILD_VERBOSITY: "3"
           DISTUTILS_USE_SDK: "1"
           MSSdk: "1"
+          CIBW_BEFORE_ALL: |
+            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+            cd sleef
+            cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            cmake --build build --config Release
+            cmake --install build --prefix "C:/sleef" --config Release
+            cd ..
+            mkdir -p C:/quadblas/include
+            cp -r QuadBLAS/include/quadblas C:/quadblas/include/
           CIBW_BEFORE_BUILD: |
             pip install meson meson-python ninja numpy
+          CIBW_ENVIRONMENT: >
+            INCLUDE="C:/sleef/include;C:/quadblas/include;$INCLUDE"
+            LIB="C:/sleef/lib;$LIB"
+            PATH="C:/sleef/bin;$PATH"
+            CFLAGS="/IC:/sleef/include /IC:/quadblas/include $CFLAGS"
+            CXXFLAGS="/IC:/sleef/include /IC:/quadblas/include /std:c++17 $CXXFLAGS"
+            LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
           CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
           CIBW_TEST_COMMAND: |
             pip install {package}[test]
@@ -216,39 +322,4 @@ jobs:
       - name: Publish to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
-          packages-dir: dist/*
-
-  # With the current setup, we are not creating a release on GitHub.
-  # create_release:
-  #   name: Create Release
-  #   needs: [build_wheels_linux, build_wheels_macos, build_wheels_windows]
-  #   runs-on: ubuntu-latest
-  #   if: startsWith(github.ref, 'refs/tags/quaddtype-v')
-
-  #   steps:
-  #     - name: Checkout code
-  #       uses: actions/checkout@v2
-
-  #     - name: Download all workflow run artifacts
-  #       uses: actions/download-artifact@v4
-  #       with:
-  #         path: artifacts
-
-  #     - name: Create Release
-  #       id: create_release
-  #       uses: actions/create-release@v1
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.QUADDTYPE_GITHUB_TOKEN }}
-  #       with:
-  #         tag_name: ${{ github.ref }}
-  #         release_name: Release ${{ github.ref }}
-  #         draft: false
-  #         prerelease: false
-
-  #     - name: Upload Release Assets
-  #       uses: softprops/action-gh-release@v1
-  #       if: startsWith(github.ref, 'refs/tags/')
-  #       with:
-  #         files: ./artifacts/**/*.whl
-  #       env:
-  #         GITHUB_TOKEN: ${{ secrets.QUADDTYPE_GITHUB_TOKEN }}
+          packages-dir: dist/*
\ No newline at end of file
diff --git a/quaddtype/README.md b/quaddtype/README.md
index 614a3c2..af4ddef 100644
--- a/quaddtype/README.md
+++ b/quaddtype/README.md
@@ -53,7 +53,10 @@ source temp/bin/activate
 # Install the package
 pip install meson-python numpy pytest
 
-export LDFLAGS="-Wl,-rpath,$SLEEF_DIR/lib"
+export LDFLAGS="-Wl,-rpath,$SLEEF_DIR/lib -fopenmp -latomic -lpthread"
+export CFLAGS="-fPIC"
+export CXXFLAGS="-fPIC"
+
 python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
 
 # Run the tests
diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 3d74bc5..4d81606 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 3d74bc5038d7b44f50b1c1cbd292da9e1b853b56
+Subproject commit 4d81606463d67c6a2d3efa4c647cbb95b21ddbc4
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index c08320b..b5cee77 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -17,178 +17,170 @@ extern "C" {
 #include "quad_common.h"
 #include "quadblas_interface.h"
 
-// Include QuadBLAS header
 #include "../QBLAS/include/quadblas/quadblas.hpp"
 
-// Helper function to get QuadBLAS layout from numpy array
-static QuadBLAS::Layout get_quadblas_layout(PyArrayObject *arr) {
+static QuadBLAS::Layout
+get_quadblas_layout(PyArrayObject *arr)
+{
     if (PyArray_IS_C_CONTIGUOUS(arr)) {
         return QuadBLAS::Layout::RowMajor;
-    } else {
+    }
+    else {
         return QuadBLAS::Layout::ColMajor;
     }
 }
 
-// Helper function to extract quad data and backend info from QuadPrecDType array
-static bool extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, 
-                                   QuadBackendType *backend, QuadBLAS::Layout *layout) {
+static bool
+extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend,
+                        QuadBLAS::Layout *layout)
+{
     if (!PyArray_Check(arr)) {
         PyErr_SetString(PyExc_TypeError, "Expected numpy array");
         return false;
     }
-    
+
     PyArray_Descr *descr = PyArray_DESCR(arr);
-    if (!PyObject_TypeCheck(descr, (PyTypeObject*)&QuadPrecDType)) {
+    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
         PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
         return false;
     }
-    
-    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject*)descr;
+
+    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
     *backend = quad_descr->backend;
-    *data = (Sleef_quad*)PyArray_DATA(arr);
+    *data = (Sleef_quad *)PyArray_DATA(arr);
     *layout = get_quadblas_layout(arr);
-    
+
     return true;
 }
 
-// Helper function to convert between backends if needed
-static Sleef_quad* ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend, 
-                                       Sleef_quad **temp_storage) {
+static Sleef_quad *
+ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend,
+                     Sleef_quad **temp_storage)
+{
     if (original_backend == BACKEND_SLEEF) {
         *temp_storage = nullptr;
-        return (Sleef_quad*)PyArray_DATA(arr);
+        return (Sleef_quad *)PyArray_DATA(arr);
     }
-    
-    // Need to convert from longdouble to sleef
+
     npy_intp size = PyArray_SIZE(arr);
     *temp_storage = QuadBLAS::aligned_alloc<Sleef_quad>(size);
     if (!*temp_storage) {
         PyErr_NoMemory();
         return nullptr;
     }
-    
-    long double *ld_data = (long double*)PyArray_DATA(arr);
+
+    long double *ld_data = (long double *)PyArray_DATA(arr);
     for (npy_intp i = 0; i < size; i++) {
         (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);
     }
-    
+
     return *temp_storage;
 }
 
-// Vector-Vector dot product
-static PyObject* dot_vector_vector(PyArrayObject *a, PyArrayObject *b) {
-    // Validate dimensions
+static PyObject *
+dot_vector_vector(PyArrayObject *a, PyArrayObject *b)
+{
     if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
         PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
         return nullptr;
     }
-    
+
     npy_intp n_a = PyArray_DIM(a, 0);
     npy_intp n_b = PyArray_DIM(b, 0);
-    
+
     if (n_a != n_b) {
         PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
         return nullptr;
     }
-    
-    // Extract data and backend info
+
     Sleef_quad *data_a, *data_b;
     QuadBackendType backend_a, backend_b;
     QuadBLAS::Layout layout_a, layout_b;
-    
+
     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
         return nullptr;
     }
-    
-    // Convert to SLEEF backend if needed (QuadBLAS uses SLEEF internally)
+
     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-    
+
     if (!sleef_a || !sleef_b) {
         QuadBLAS::aligned_free(temp_a);
         QuadBLAS::aligned_free(temp_b);
         return nullptr;
     }
-    
-    // Get strides in terms of elements (not bytes)
+
     npy_intp stride_a = PyArray_STRIDE(a, 0) / PyArray_ITEMSIZE(a);
     npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
-    
-    // Perform dot product using QuadBLAS
+
     Sleef_quad result = QuadBLAS::dot(n_a, sleef_a, stride_a, sleef_b, stride_b);
-    
-    // Clean up temporary storage
+
     QuadBLAS::aligned_free(temp_a);
     QuadBLAS::aligned_free(temp_b);
-    
-    // Determine result backend (prefer SLEEF, fall back to common backend)
+
     QuadBackendType result_backend = BACKEND_SLEEF;
     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
         result_backend = BACKEND_LONGDOUBLE;
     }
-    
-    // Create result scalar
+
     QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
     if (!result_obj) {
         return nullptr;
     }
-    
+
     if (result_backend == BACKEND_SLEEF) {
         result_obj->value.sleef_value = result;
-    } else {
+    }
+    else {
         result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
     }
-    
-    return (PyObject*)result_obj;
+
+    return (PyObject *)result_obj;
 }
 
-// Matrix-Vector multiplication
-static PyObject* dot_matrix_vector(PyArrayObject *a, PyArrayObject *b) {
-    // Validate dimensions
+static PyObject *
+dot_matrix_vector(PyArrayObject *a, PyArrayObject *b)
+{
     if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
         PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
         return nullptr;
     }
-    
+
     npy_intp m = PyArray_DIM(a, 0);
     npy_intp n = PyArray_DIM(a, 1);
     npy_intp n_b = PyArray_DIM(b, 0);
-    
+
     if (n != n_b) {
         PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
         return nullptr;
     }
-    
-    // Extract data and backend info
+
     Sleef_quad *data_a, *data_b;
     QuadBackendType backend_a, backend_b;
     QuadBLAS::Layout layout_a, layout_b;
-    
+
     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
         return nullptr;
     }
-    
-    // Convert to SLEEF backend if needed
+
     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-    
+
     if (!sleef_a || !sleef_b) {
         QuadBLAS::aligned_free(temp_a);
         QuadBLAS::aligned_free(temp_b);
         return nullptr;
     }
-    
-    // Determine result backend
+
     QuadBackendType result_backend = BACKEND_SLEEF;
     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
         result_backend = BACKEND_LONGDOUBLE;
     }
-    
-    // Create result array (1D with length m)
+
     npy_intp result_dims[1] = {m};
     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
     if (!result_dtype) {
@@ -196,98 +188,90 @@ static PyObject* dot_matrix_vector(PyArrayObject *a, PyArrayObject *b) {
         QuadBLAS::aligned_free(temp_b);
         return nullptr;
     }
-    
-    PyArrayObject *result = (PyArrayObject*)PyArray_Empty(1, result_dims, 
-                                                         (PyArray_Descr*)result_dtype, 0);
+
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
     if (!result) {
         QuadBLAS::aligned_free(temp_a);
         QuadBLAS::aligned_free(temp_b);
         Py_DECREF(result_dtype);
         return nullptr;
     }
-    
-    Sleef_quad *result_data = (Sleef_quad*)PyArray_DATA(result);
-    
-    // FIXED: Calculate leading dimensions and strides correctly
+
+    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+
     npy_intp lda;
     if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = n;  // For row-major, leading dimension is number of columns
-    } else {
-        lda = m;  // For column-major, leading dimension is number of rows
+        lda = n;
+    }
+    else {
+        lda = m;
     }
-    
+
     npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
     npy_intp stride_result = PyArray_STRIDE(result, 0) / PyArray_ITEMSIZE(result);
-    
-    // Perform matrix-vector multiplication using QuadBLAS
-    // y = 1.0 * A * x + 0.0 * y
+
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-    
-    QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, 
-                   sleef_b, stride_b, beta, result_data, stride_result);
-    
-    // Convert result back to longdouble if needed
+
+    QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, sleef_b, stride_b, beta, result_data,
+                   stride_result);
+
     if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double*)PyArray_DATA(result);
+        long double *ld_result = (long double *)PyArray_DATA(result);
         for (npy_intp i = 0; i < m; i++) {
             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
         }
     }
-    
-    // Clean up temporary storage
+
     QuadBLAS::aligned_free(temp_a);
     QuadBLAS::aligned_free(temp_b);
-    
-    return (PyObject*)result;
+
+    return (PyObject *)result;
 }
 
-// Matrix-Matrix multiplication
-static PyObject* dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b) {
-    // Validate dimensions
+static PyObject *
+dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b)
+{
     if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
         PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
         return nullptr;
     }
-    
+
     npy_intp m = PyArray_DIM(a, 0);
     npy_intp k = PyArray_DIM(a, 1);
     npy_intp k_b = PyArray_DIM(b, 0);
     npy_intp n = PyArray_DIM(b, 1);
-    
+
     if (k != k_b) {
         PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
         return nullptr;
     }
-    
-    // Extract data and backend info
+
     Sleef_quad *data_a, *data_b;
     QuadBackendType backend_a, backend_b;
     QuadBLAS::Layout layout_a, layout_b;
-    
+
     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
         return nullptr;
     }
-    
-    // Convert to SLEEF backend if needed
+
     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-    
+
     if (!sleef_a || !sleef_b) {
         QuadBLAS::aligned_free(temp_a);
         QuadBLAS::aligned_free(temp_b);
         return nullptr;
     }
-    
-    // Determine result backend
+
     QuadBackendType result_backend = BACKEND_SLEEF;
     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
         result_backend = BACKEND_LONGDOUBLE;
     }
-    
-    // Create result array (2D with shape m x n)
+
     npy_intp result_dims[2] = {m, n};
     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
     if (!result_dtype) {
@@ -295,133 +279,135 @@ static PyObject* dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b) {
         QuadBLAS::aligned_free(temp_b);
         return nullptr;
     }
-    
-    PyArrayObject *result = (PyArrayObject*)PyArray_Empty(2, result_dims, 
-                                                         (PyArray_Descr*)result_dtype, 0);
+
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
     if (!result) {
         QuadBLAS::aligned_free(temp_a);
         QuadBLAS::aligned_free(temp_b);
         Py_DECREF(result_dtype);
         return nullptr;
     }
-    
-    Sleef_quad *result_data = (Sleef_quad*)PyArray_DATA(result);
-    
-    // FIXED: Calculate leading dimensions correctly
+
+    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+
     npy_intp lda, ldb, ldc;
-    
+
     if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = k;  // For row-major A: leading dimension is number of columns
-    } else {
-        lda = m;  // For column-major A: leading dimension is number of rows
+        lda = k;
+    }
+    else {
+        lda = m;
     }
-    
+
     if (layout_b == QuadBLAS::Layout::RowMajor) {
-        ldb = n;  // For row-major B: leading dimension is number of columns
-    } else {
-        ldb = k;  // For column-major B: leading dimension is number of rows
+        ldb = n;
+    }
+    else {
+        ldb = k;
     }
-    
-    // Result array layout - assume same as input A
+
     QuadBLAS::Layout result_layout = layout_a;
     if (result_layout == QuadBLAS::Layout::RowMajor) {
-        ldc = n;  // For row-major C: leading dimension is number of columns
-    } else {
-        ldc = m;  // For column-major C: leading dimension is number of rows
+        ldc = n;
+    }
+    else {
+        ldc = m;
     }
-    
-    // Perform matrix-matrix multiplication using QuadBLAS
-    // C = 1.0 * A * B + 0.0 * C
+
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-    
-    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, 
-                   sleef_b, ldb, beta, result_data, ldc);
-    
-    // Convert result back to longdouble if needed
+
+    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
+                   ldc);
+
     if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double*)PyArray_DATA(result);
+        long double *ld_result = (long double *)PyArray_DATA(result);
         for (npy_intp i = 0; i < m * n; i++) {
             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
         }
     }
-    
-    // Clean up temporary storage
+
     QuadBLAS::aligned_free(temp_a);
     QuadBLAS::aligned_free(temp_b);
-    
-    return (PyObject*)result;
+
+    return (PyObject *)result;
 }
 
-// Main dot function that dispatches based on input dimensions
-PyObject* py_quadblas_dot(PyObject* self, PyObject* args) {
+PyObject *
+py_quadblas_dot(PyObject *self, PyObject *args)
+{
     PyObject *a_obj, *b_obj;
-    
+
     if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
         return nullptr;
     }
-    
-    // Convert to arrays if needed
-    PyArrayObject *a = (PyArrayObject*)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
-    PyArrayObject *b = (PyArrayObject*)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
-    
+
+    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
+    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
+
     if (!a || !b) {
         Py_XDECREF(a);
         Py_XDECREF(b);
         PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
         return nullptr;
     }
-    
+
     PyObject *result = nullptr;
-    
-    // Dispatch based on dimensions
+
     int ndim_a = PyArray_NDIM(a);
     int ndim_b = PyArray_NDIM(b);
-    
+
     if (ndim_a == 1 && ndim_b == 1) {
-        // Vector-Vector dot product
         result = dot_vector_vector(a, b);
-    } else if (ndim_a == 2 && ndim_b == 1) {
-        // Matrix-Vector multiplication
+    }
+    else if (ndim_a == 2 && ndim_b == 1) {
         result = dot_matrix_vector(a, b);
-    } else if (ndim_a == 2 && ndim_b == 2) {
-        // Matrix-Matrix multiplication
+    }
+    else if (ndim_a == 2 && ndim_b == 2) {
         result = dot_matrix_matrix(a, b);
-    } else if (ndim_a == 1 && ndim_b == 2) {
-        PyErr_SetString(PyExc_ValueError, 
-            "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
-    } else {
-        PyErr_SetString(PyExc_ValueError, 
-            "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
-    }
-    
+    }
+    else if (ndim_a == 1 && ndim_b == 2) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
+    }
+
     Py_DECREF(a);
     Py_DECREF(b);
-    
+
     return result;
 }
 
-// Threading control functions
-PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args) {
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args)
+{
     int num_threads;
-    
+
     if (!PyArg_ParseTuple(args, "i", &num_threads)) {
         return nullptr;
     }
-    
+
     if (num_threads < 1) {
         PyErr_SetString(PyExc_ValueError, "Number of threads must be positive");
         return nullptr;
     }
-    
+
     QuadBLAS::set_num_threads(num_threads);
     Py_RETURN_NONE;
 }
 
-PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args) {
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+{
     return PyLong_FromLong(QuadBLAS::get_num_threads());
 }
 
-PyObject* py_quadblas_get_version(PyObject* self, PyObject* args) {
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args)
+{
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index 1bdfdfb..da8f0a8 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -7,21 +7,17 @@ extern "C" {
 
 #include <Python.h>
 
-// Main dot function that handles vector-vector, matrix-vector, and matrix-matrix operations
-// Renamed to avoid conflicts with QuadBLAS C interface
+
 PyObject* py_quadblas_dot(PyObject* self, PyObject* args);
 
-// Threading control functions
-// Renamed to avoid conflicts with QuadBLAS C interface
+
 PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args);
 PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args);
 
-// Version information
-// Renamed to avoid conflicts with QuadBLAS C interface
 PyObject* py_quadblas_get_version(PyObject* self, PyObject* args);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif // _QUADDTYPE_QUADBLAS_INTERFACE_H
\ No newline at end of file
+#endif 
\ No newline at end of file
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
new file mode 100644
index 0000000..ed135f4
--- /dev/null
+++ b/quaddtype/tests/test_dot.py
@@ -0,0 +1,377 @@
+"""
+Focused test suite for the dot function in numpy_quaddtype
+
+This module tests the QuadBLAS dot function for:
+- Vector-vector dot products
+- Matrix-vector multiplication  
+- Matrix-matrix multiplication
+- Small and large matrix operations
+- Basic correctness validation
+
+Uses only the Sleef backend for simplicity.
+"""
+
+import pytest
+import numpy as np
+from numpy_quaddtype import QuadPrecision, QuadPrecDType, dot
+
+
+# ================================================================================
+# UTILITIES
+# ================================================================================
+
+def assert_quad_equal(a, b, rtol=1e-15, atol=1e-15):
+    """Check ``|a - b| <= atol + rtol * max(|a|, |b|)`` in quad precision.
+
+    Operands that are not already QuadPrecision are converted through their
+    str() form using the Sleef backend, so plain ints/floats may be passed.
+    """
+    def from_text(x):
+        return QuadPrecision(str(x), backend='sleef')
+
+    a = a if isinstance(a, QuadPrecision) else from_text(a)
+    b = b if isinstance(b, QuadPrecision) else from_text(b)
+    diff = abs(a - b)
+    tolerance = from_text(atol) + from_text(rtol) * max(abs(a), abs(b))
+    assert diff <= tolerance, f"Values not equal: {a} != {b} (diff: {diff}, tol: {tolerance})"
+
+
+def assert_quad_array_equal(a, b, rtol=1e-25, atol=1e-25):
+    """Assert two quad precision arrays match element-wise within tolerance.
+
+    Shapes must be equal; the first mismatch is reported by its flat index.
+    """
+    assert a.shape == b.shape, f"Shapes don't match: {a.shape} vs {b.shape}"
+
+    for i, (val_a, val_b) in enumerate(zip(a.flatten(), b.flatten())):
+        try:
+            assert_quad_equal(val_a, val_b, rtol, atol)
+        except AssertionError as e:
+            raise AssertionError(f"Arrays differ at index {i}: {e}") from e
+
+
+def create_quad_array(values, shape=None):
+    """Build a Sleef-backend QuadPrecision ndarray from a flat list or tuple.
+
+    With ``shape`` omitted or of length 1 the result is 1-D (a length-1
+    ``shape`` is not checked against ``len(values)``). With a length-2
+    ``shape`` the values fill an m x n array row by row. Any other
+    ``values`` type or ``shape`` length raises ValueError.
+    """
+    def to_quad(v):
+        return QuadPrecision(str(float(v)), backend='sleef')
+
+    quad_dtype = QuadPrecDType(backend='sleef')
+    # rank stays None for non-list/tuple input so it falls through to the raise
+    rank = None
+    if isinstance(values, (list, tuple)):
+        rank = 1 if shape is None else len(shape)
+    if rank == 1:
+        return np.array([to_quad(v) for v in values], dtype=quad_dtype)
+    if rank == 2:
+        m, n = shape
+        assert len(values) == m * n, f"Values length {len(values)} doesn't match shape {shape}"
+        rows = [[to_quad(values[r * n + c]) for c in range(n)] for r in range(m)]
+        return np.array(rows, dtype=quad_dtype)
+    raise ValueError("Unsupported values or shape")
+
+
+# ================================================================================
+# VECTOR-VECTOR DOT PRODUCT TESTS
+# ================================================================================
+
+class TestVectorVectorDot:
+    """Dot products of two 1-D quad precision vectors"""
+
+    def test_simple_dot_product(self):
+        """Small integer vectors give a QuadPrecision scalar equal to 32"""
+        lhs = create_quad_array([1, 2, 3])
+        rhs = create_quad_array([4, 5, 6])
+
+        product = dot(lhs, rhs)
+        reference = 1*4 + 2*5 + 3*6  # = 32
+
+        assert isinstance(product, QuadPrecision)
+        assert_quad_equal(product, reference)
+
+    def test_orthogonal_vectors(self):
+        """Unit vectors along different axes have a zero dot product"""
+        e0 = create_quad_array([1, 0, 0])
+        e1 = create_quad_array([0, 1, 0])
+
+        product = dot(e0, e1)
+        assert_quad_equal(product, 0.0)
+
+    def test_same_vector(self):
+        """A vector dotted with itself gives the sum of its squared entries"""
+        vec = create_quad_array([2, 3, 4])
+
+        product = dot(vec, vec)
+        reference = 2*2 + 3*3 + 4*4  # = 29
+
+        assert_quad_equal(product, reference)
+
+    @pytest.mark.parametrize("size", [1, 2, 5, 10, 50, 100])
+    def test_various_vector_sizes(self, size):
+        """Lengths from 1 to 100 match an exact integer reference"""
+        # Known pattern: lhs = [1, 2, 3, ...], rhs = [2, 4, 6, ...]
+        lhs_vals = list(range(1, size + 1))
+        rhs_vals = [2 * v for v in lhs_vals]
+
+        lhs = create_quad_array(lhs_vals)
+        rhs = create_quad_array(rhs_vals)
+
+        product = dot(lhs, rhs)
+        reference = sum(p * q for p, q in zip(lhs_vals, rhs_vals))
+
+        assert_quad_equal(product, reference)
+
+    def test_negative_and_fractional_values(self):
+        """Mixed-sign fractional entries are handled"""
+        lhs = create_quad_array([1.5, -2.5, 3.25])
+        rhs = create_quad_array([-1.25, 2.75, -3.5])
+
+        product = dot(lhs, rhs)
+        reference = 1.5*(-1.25) + (-2.5)*2.75 + 3.25*(-3.5)
+
+        assert_quad_equal(product, reference)
+
+
+# ================================================================================
+# MATRIX-VECTOR MULTIPLICATION TESTS  
+# ================================================================================
+
+class TestMatrixVectorDot:
+    """Test matrix-vector multiplication (2-D matrix times 1-D vector)"""
+
+    def test_simple_matrix_vector(self):
+        """A 2x3 matrix times a vector of ones yields the row sums"""
+        # 2x3 matrix
+        A = create_quad_array([1, 2, 3, 4, 5, 6], shape=(2, 3))
+        # length-3 vector (1-D, not a 3x1 matrix)
+        x = create_quad_array([1, 1, 1])
+
+        result = dot(A, x)
+        expected = [1+2+3, 4+5+6]  # [6, 15]
+
+        assert result.shape == (2,)
+        for i in range(2):
+            assert_quad_equal(result[i], expected[i])
+
+    def test_identity_matrix_vector(self):
+        """Multiplying by the identity matrix returns the vector unchanged"""
+        # 3x3 identity matrix
+        I = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3))
+        x = create_quad_array([2, 3, 4])
+
+        result = dot(I, x)
+
+        assert result.shape == (3,)
+        for i in range(3):
+            assert_quad_equal(result[i], float(x[i]))
+
+    @pytest.mark.parametrize("m,n", [(2,3), (3,2), (5,4), (10,8), (20,15)])
+    def test_various_matrix_vector_sizes(self, m, n):
+        """Test various matrix-vector sizes from small to large"""
+        # Create m×n matrix with sequential values
+        A_vals = [(i*n + j + 1) for i in range(m) for j in range(n)]
+        A = create_quad_array(A_vals, shape=(m, n))
+
+        # Create length-n vector with simple values
+        x_vals = [i + 1 for i in range(n)]
+        x = create_quad_array(x_vals)
+
+        result = dot(A, x)
+
+        assert result.shape == (m,)
+
+        # Check every row for every size: inputs are small integers, so the
+        # Python integer reference is exact and costs at most 20*15 products.
+        for i in range(m):
+            expected = sum(A_vals[i*n + j] * x_vals[j] for j in range(n))
+            assert_quad_equal(result[i], expected)
+
+
+# ================================================================================
+# MATRIX-MATRIX MULTIPLICATION TESTS
+# ================================================================================
+
+class TestMatrixMatrixDot:
+    """Test matrix-matrix multiplication"""
+    
+    def test_simple_matrix_matrix(self):
+        """Test basic matrix-matrix multiplication"""
+        # 2x2 matrices
+        A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        B = create_quad_array([5, 6, 7, 8], shape=(2, 2))
+        
+        result = dot(A, B)
+        
+        # Expected: [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]] = [[19, 22], [43, 50]]
+        expected = [[19, 22], [43, 50]]
+        
+        assert result.shape == (2, 2)
+        for i in range(2):
+            for j in range(2):
+                assert_quad_equal(result[i, j], expected[i][j])
+    
+    def test_identity_matrix_multiplication(self):
+        """Test multiplication with identity matrix"""
+        A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        I = create_quad_array([1, 0, 0, 1], shape=(2, 2))
+        
+        # A * I should equal A
+        result1 = dot(A, I)
+        assert_quad_array_equal(result1, A)
+        
+        # I * A should equal A  
+        result2 = dot(I, A)
+        assert_quad_array_equal(result2, A)
+    
+    @pytest.mark.parametrize("m,n,k", [(2,2,2), (2,3,4), (3,2,5), (4,4,4), (5,6,7)])
+    def test_various_matrix_sizes(self, m, n, k):
+        """Test various matrix sizes: (m×k) × (k×n) = (m×n)"""
+        # Create A: m×k matrix
+        A_vals = [(i*k + j + 1) for i in range(m) for j in range(k)]
+        A = create_quad_array(A_vals, shape=(m, k))
+        
+        # Create B: k×n matrix  
+        B_vals = [(i*n + j + 1) for i in range(k) for j in range(n)]
+        B = create_quad_array(B_vals, shape=(k, n))
+        
+        result = dot(A, B)
+        
+        assert result.shape == (m, n)
+        
+        # Verify manually for small matrices
+        if m <= 3 and n <= 3 and k <= 3:
+            for i in range(m):
+                for j in range(n):
+                    expected = sum(A_vals[i*k + l] * B_vals[l*n + j] for l in range(k))
+                    assert_quad_equal(result[i, j], expected)
+    
+    def test_associativity(self):
+        """Test matrix multiplication associativity: (A*B)*C = A*(B*C)"""
+        # Use small 2x2 matrices for simplicity
+        A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        B = create_quad_array([2, 1, 1, 2], shape=(2, 2))
+        C = create_quad_array([1, 1, 2, 1], shape=(2, 2))
+        
+        # Compute (A*B)*C
+        AB = dot(A, B)
+        result1 = dot(AB, C)
+        
+        # Compute A*(B*C)
+        BC = dot(B, C)
+        result2 = dot(A, BC)
+        
+        assert_quad_array_equal(result1, result2, rtol=1e-25)
+
+
+# ================================================================================
+# LARGE MATRIX TESTS
+# ================================================================================
+
+class TestLargeMatrices:
+    """Test performance and correctness with larger matrices"""
+    
+    @pytest.mark.parametrize("size", [50, 100, 200])
+    def test_large_square_matrices(self, size):
+        """Test large square matrix multiplication"""
+        # Create matrices with simple pattern for verification
+        A_vals = [1.0 if i == j else 0.1 for i in range(size) for j in range(size)]  # Near-diagonal
+        B_vals = [1.0] * (size * size)  # All ones
+        
+        A = create_quad_array(A_vals, shape=(size, size))
+        B = create_quad_array(B_vals, shape=(size, size))
+        
+        result = dot(A, B)
+        
+        assert result.shape == (size, size)
+        
+        # Each element = sum of a row in A = 1.0 + 0.1*(size-1)
+        expected_value = 1.0 + 0.1 * (size - 1)
+        
+        # Check diagonal and off-diagonal elements
+        assert_quad_equal(result[0, 0], expected_value, rtol=1e-15, atol=1e-15)
+        if size > 1:
+            assert_quad_equal(result[0, 1], expected_value, rtol=1e-15, atol=1e-15)
+        
+        # Additional verification: check a few more elements
+        if size > 2:
+            assert_quad_equal(result[1, 0], expected_value, rtol=1e-15, atol=1e-15)
+            assert_quad_equal(result[size//2, size//2], expected_value, rtol=1e-15, atol=1e-15)
+    
+    def test_large_vector_operations(self):
+        """Test large vector dot products"""
+        size = 1000
+        
+        # Create vectors with known sum
+        x_vals = [1.0] * size
+        y_vals = [2.0] * size
+        
+        x = create_quad_array(x_vals)
+        y = create_quad_array(y_vals)
+        
+        result = dot(x, y)
+        expected = size * 1.0 * 2.0  # = 2000.0
+        
+        assert_quad_equal(result, expected)
+    
+    def test_rectangular_large_matrices(self):
+        """Test large rectangular matrix operations"""
+        m, n, k = 100, 80, 120
+        
+        # Create simple patterns
+        A_vals = [(i + j + 1) % 10 for i in range(m) for j in range(k)]
+        B_vals = [(i + j + 1) % 10 for i in range(k) for j in range(n)]
+        
+        A = create_quad_array(A_vals, shape=(m, k))
+        B = create_quad_array(B_vals, shape=(k, n))
+        
+        result = dot(A, B)
+        
+        assert result.shape == (m, n)
+        
+        # Verify that result doesn't contain NaN or inf
+        result_flat = result.flatten()
+        for i in range(min(10, len(result_flat))):  # Check first few elements
+            val = float(result_flat[i])
+            assert not np.isnan(val), f"NaN found at position {i}"
+            assert not np.isinf(val), f"Inf found at position {i}"
+
+
+# ================================================================================
+# BASIC ERROR HANDLING
+# ================================================================================
+
+class TestBasicErrorHandling:
+    """Test basic error conditions"""
+    
+    def test_dimension_mismatch_vectors(self):
+        """Test dimension mismatch in vectors"""
+        x = create_quad_array([1, 2])
+        y = create_quad_array([1, 2, 3])
+        
+        with pytest.raises(ValueError, match="same length"):
+            dot(x, y)
+    
+    def test_dimension_mismatch_matrix_vector(self):
+        """Test dimension mismatch in matrix-vector"""
+        A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        x = create_quad_array([1, 2, 3])  # Wrong size
+        
+        with pytest.raises(ValueError, match="columns must match"):
+            dot(A, x)
+    
+    def test_dimension_mismatch_matrices(self):
+        """Test dimension mismatch in matrix-matrix"""
+        A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
+        
+        with pytest.raises(ValueError, match="Matrix inner dimensions must match"):
+            dot(A, B)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
\ No newline at end of file

From ca7dd6d84f93a9d74823b72205ef57e5227613d1 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 19:22:18 +0000
Subject: [PATCH 04/49] ci: use quaddtype/numpy_quaddtype/QBLAS submodule path

---
 .github/workflows/build_wheels.yml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index d8114a1..3b8ae86 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -34,8 +34,8 @@ jobs:
 
       - name: Verify QuadBLAS submodule
         run: |
-          ls -la QuadBLAS/
-          ls -la QuadBLAS/include/quadblas/
+          ls -la quaddtype/numpy_quaddtype/QBLAS/
+          ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
       - name: Install SLEEF and setup QuadBLAS
         run: |
@@ -51,7 +51,7 @@ jobs:
           cd ..
           
           # Install QuadBLAS headers from submodule (header-only)
-          sudo cp -r QuadBLAS/include/quadblas /usr/local/include/
+          sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
           
           # Verify installation
           ls -la /usr/local/include/quadblas/
@@ -78,7 +78,7 @@ jobs:
             cd ..
             
             # Install QuadBLAS headers from submodule
-            cp -r QuadBLAS/include/quadblas /usr/local/include/
+            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
           CIBW_ENVIRONMENT: >
             CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
             CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
@@ -123,8 +123,8 @@ jobs:
 
       - name: Verify QuadBLAS submodule
         run: |
-          ls -la QuadBLAS/
-          ls -la QuadBLAS/include/quadblas/
+          ls -la quaddtype/numpy_quaddtype/QBLAS/
+          ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
       - name: Install SLEEF and setup QuadBLAS
         env:
@@ -145,7 +145,7 @@ jobs:
           cd ..
           
           # Install QuadBLAS headers from submodule
-          sudo cp -r QuadBLAS/include/quadblas /usr/local/include/
+          sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
 
       - name: Install cibuildwheel
         run: pip install cibuildwheel==2.20.0
@@ -171,7 +171,7 @@ jobs:
             cd ..
             
             # Install QuadBLAS headers from submodule
-            cp -r QuadBLAS/include/quadblas /usr/local/include/
+            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
           CIBW_ENVIRONMENT: >
             MACOSX_DEPLOYMENT_TARGET="11.0"
             DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
@@ -223,8 +223,8 @@ jobs:
       - name: Verify QuadBLAS submodule
         shell: pwsh
         run: |
-          Get-ChildItem QuadBLAS/
-          Get-ChildItem QuadBLAS/include/quadblas/
+          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/
+          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
       - name: Clone and Build SLEEF
         shell: pwsh
@@ -239,7 +239,7 @@ jobs:
         shell: pwsh
         run: |
           New-Item -ItemType Directory -Force -Path "C:/quadblas/include"
-          Copy-Item -Recurse -Force "QuadBLAS/include/quadblas" "C:/quadblas/include/"
+          Copy-Item -Recurse -Force "quaddtype/numpy_quaddtype/QBLAS/include/quadblas" "C:/quadblas/include/"
 
       - name: Setup build environment
         shell: pwsh
@@ -273,7 +273,7 @@ jobs:
             cmake --install build --prefix "C:/sleef" --config Release
             cd ..
             mkdir -p C:/quadblas/include
-            cp -r QuadBLAS/include/quadblas C:/quadblas/include/
+            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas C:/quadblas/include/
           CIBW_BEFORE_BUILD: |
             pip install meson meson-python ninja numpy
           CIBW_ENVIRONMENT: >

From 04314e33995197a780dc5986ab393bf9638a1a10 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 19:23:46 +0000
Subject: [PATCH 05/49] ci: comment out pull_request paths filter

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 3b8ae86..1b0b5a6 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -9,8 +9,8 @@ on:
     paths:
       - "quaddtype/**"
   pull_request:
-    paths:
-      - "quaddtype/**"
+    # paths:
+    #   - "quaddtype/**"
   workflow_dispatch:
 
 jobs:

From 037021afce2790b3e85b531d9f4c0f6c42c29920 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 9 Jul 2025 19:24:31 +0000
Subject: [PATCH 06/49] ci: comment out paths filter on branch/tag trigger

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 1b0b5a6..f0c0ade 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -6,8 +6,8 @@ on:
       - dot
     tags:
       - "quaddtype-v*"
-    paths:
-      - "quaddtype/**"
+    # paths:
+    #   - "quaddtype/**"
   pull_request:
     # paths:
     #   - "quaddtype/**"

From d6fc9c6a5cb56e11315411d07d45ed2fe3405a80 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:21:21 +0530
Subject: [PATCH 07/49] ci: build SLEEF in cibuildwheel container; disable macOS/Windows/publish jobs

---
 .github/workflows/build_wheels.yml | 441 ++++++++++++++---------------
 1 file changed, 210 insertions(+), 231 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index f0c0ade..3f2b9e9 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -37,26 +37,6 @@ jobs:
           ls -la quaddtype/numpy_quaddtype/QBLAS/
           ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
-      - name: Install SLEEF and setup QuadBLAS
-        run: |
-          # Install SLEEF
-          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-          cd sleef
-          cmake -S . -B build \
-            -DSLEEF_BUILD_QUAD:BOOL=ON \
-            -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-          cmake --build build/ --clean-first -j
-          sudo cmake --install build --prefix /usr/local
-          cd ..
-          
-          # Install QuadBLAS headers from submodule (header-only)
-          sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
-          
-          # Verify installation
-          ls -la /usr/local/include/quadblas/
-          ls -la /usr/local/lib/
-
       - name: Install cibuildwheel
         run: pip install cibuildwheel==2.20.0
 
@@ -66,7 +46,8 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
-            # Install SLEEF
+            # Install SLEEF in container
+            yum install -y cmake git gcc gcc-c++ make
             git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
             cd sleef
             cmake -S . -B build \
@@ -76,13 +57,11 @@ jobs:
             cmake --build build/ --clean-first -j
             cmake --install build --prefix /usr/local
             cd ..
-            
-            # Install QuadBLAS headers from submodule
-            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
+            rm -rf sleef
           CIBW_ENVIRONMENT: >
-            CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
-            CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib64 -Wl,-rpath,/usr/local/lib64 -lsleef -lsleefquad -fopenmp -latomic -lpthread $LDFLAGS"
+            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -std=c++17 $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -lsleef -lsleefquad -fopenmp $LDFLAGS"
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
             PKG_CONFIG_PATH="/usr/local/lib64/pkgconfig:/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
           CIBW_REPAIR_WHEEL_COMMAND: |
@@ -100,226 +79,226 @@ jobs:
           path: ./quaddtype/wheelhouse/*.whl
           name: wheels-linux
 
-  build_wheels_macos:
-    name: Build wheels on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [macos-13, macos-14]
+  # build_wheels_macos:
+  #   name: Build wheels on ${{ matrix.os }}
+  #   runs-on: ${{ matrix.os }}
+  #   strategy:
+  #     matrix:
+  #       os: [macos-13, macos-14]
 
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          submodules: recursive
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #       with:
+  #         submodules: recursive
 
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: "3.10"
 
-      - name: Install dependencies
-        run: |
-          brew install cmake libomp git
+  #     - name: Install dependencies
+  #       run: |
+  #         brew install cmake libomp git
 
-      - name: Verify QuadBLAS submodule
-        run: |
-          ls -la quaddtype/numpy_quaddtype/QBLAS/
-          ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
+  #     - name: Verify QuadBLAS submodule
+  #       run: |
+  #         ls -la quaddtype/numpy_quaddtype/QBLAS/
+  #         ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
-      - name: Install SLEEF and setup QuadBLAS
-        env:
-          MACOSX_DEPLOYMENT_TARGET: "11.0"
-        run: |
-          # Install SLEEF
-          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-          cd sleef
-          cmake -S . -B build \
-            -DSLEEF_BUILD_QUAD:BOOL=ON \
-            -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
-            -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-          cmake --build build/ --clean-first -j
-          sudo cmake --install build --prefix /usr/local
-          cd ..
-          
-          # Install QuadBLAS headers from submodule
-          sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
+  #     - name: Install SLEEF and setup QuadBLAS
+  #       env:
+  #         MACOSX_DEPLOYMENT_TARGET: "11.0"
+  #       run: |
+  #         # Install SLEEF
+  #         git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+  #         cd sleef
+  #         cmake -S . -B build \
+  #           -DSLEEF_BUILD_QUAD:BOOL=ON \
+  #           -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #           -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
+  #           -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
+  #           -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+  #         cmake --build build/ --clean-first -j
+  #         sudo cmake --install build --prefix /usr/local
+  #         cd ..
 
-      - name: Install cibuildwheel
-        run: pip install cibuildwheel==2.20.0
+  #         # Install QuadBLAS headers from submodule
+  #         sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
 
-      - name: Build wheels
-        env:
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
-          CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
-          CIBW_BUILD_VERBOSITY: "1"
-          CIBW_BEFORE_ALL: |
-            # Install SLEEF
-            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-            cd sleef
-            cmake -S . -B build \
-              -DSLEEF_BUILD_QUAD:BOOL=ON \
-              -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
-              -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-              -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
-              -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
-              -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-            cmake --build build/ --clean-first -j
-            cmake --install build --prefix /usr/local
-            cd ..
-            
-            # Install QuadBLAS headers from submodule
-            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
-          CIBW_ENVIRONMENT: >
-            MACOSX_DEPLOYMENT_TARGET="11.0"
-            DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
-            CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
-            CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib -lsleef -lsleefquad $LDFLAGS"
-            PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
-          CIBW_REPAIR_WHEEL_COMMAND: >
-            delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}
-          CIBW_TEST_COMMAND: |
-            pip install {package}[test]
-            pytest {project}/tests
-          CIBW_TEST_EXTRAS: "test"
-        run: |
-          python -m cibuildwheel --output-dir wheelhouse
-        working-directory: ./quaddtype
+  #     - name: Install cibuildwheel
+  #       run: pip install cibuildwheel==2.20.0
 
-      - uses: actions/upload-artifact@v4
-        with:
-          path: ./quaddtype/wheelhouse/*.whl
-          name: wheels-${{ matrix.os }}
+  #     - name: Build wheels
+  #       env:
+  #         CIBW_BUILD: "cp310-* cp311-* cp312-*"
+  #         CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
+  #         CIBW_BUILD_VERBOSITY: "1"
+  #         CIBW_BEFORE_ALL: |
+  #           # Install SLEEF
+  #           git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+  #           cd sleef
+  #           cmake -S . -B build \
+  #             -DSLEEF_BUILD_QUAD:BOOL=ON \
+  #             -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+  #             -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #             -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
+  #             -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
+  #             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+  #           cmake --build build/ --clean-first -j
+  #           cmake --install build --prefix /usr/local
+  #           cd ..
 
-  build_wheels_windows:
-    name: Build wheels on Windows
-    runs-on: windows-latest
-    strategy:
-      matrix:
-        architecture: [x64]
+  #           # Install QuadBLAS headers from submodule
+  #           cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
+  #         CIBW_ENVIRONMENT: >
+  #           MACOSX_DEPLOYMENT_TARGET="11.0"
+  #           DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
+  #           CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
+  #           CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
+  #           LDFLAGS="-L/usr/local/lib -lsleef -lsleefquad $LDFLAGS"
+  #           PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
+  #         CIBW_REPAIR_WHEEL_COMMAND: >
+  #           delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}
+  #         CIBW_TEST_COMMAND: |
+  #           pip install {package}[test]
+  #           pytest {project}/tests
+  #         CIBW_TEST_EXTRAS: "test"
+  #       run: |
+  #         python -m cibuildwheel --output-dir wheelhouse
+  #       working-directory: ./quaddtype
 
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          submodules: recursive
+  #     - uses: actions/upload-artifact@v4
+  #       with:
+  #         path: ./quaddtype/wheelhouse/*.whl
+  #         name: wheels-${{ matrix.os }}
 
-      - name: Setup MSVC
-        uses: ilammy/msvc-dev-cmd@v1
-        with:
-          arch: ${{ matrix.architecture }}
+  # build_wheels_windows:
+  #   name: Build wheels on Windows
+  #   runs-on: windows-latest
+  #   strategy:
+  #     matrix:
+  #       architecture: [x64]
 
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-          architecture: ${{ matrix.architecture }}
+  #   steps:
+  #     - uses: actions/checkout@v3
+  #       with:
+  #         submodules: recursive
 
-      - name: Install CMake
-        uses: lukka/get-cmake@latest
+  #     - name: Setup MSVC
+  #       uses: ilammy/msvc-dev-cmd@v1
+  #       with:
+  #         arch: ${{ matrix.architecture }}
 
-      - name: Verify QuadBLAS submodule
-        shell: pwsh
-        run: |
-          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/
-          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
+  #     - name: Set up Python 3.10
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: "3.10"
+  #         architecture: ${{ matrix.architecture }}
 
-      - name: Clone and Build SLEEF
-        shell: pwsh
-        run: |
-          git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-          cd sleef
-          cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-          cmake --build build --config Release
-          cmake --install build --prefix "C:/sleef" --config Release
-
-      - name: Install QuadBLAS headers from submodule
-        shell: pwsh
-        run: |
-          New-Item -ItemType Directory -Force -Path "C:/quadblas/include"
-          Copy-Item -Recurse -Force "quaddtype/numpy_quaddtype/QBLAS/include/quadblas" "C:/quadblas/include/"
+  #     - name: Install CMake
+  #       uses: lukka/get-cmake@latest
 
-      - name: Setup build environment
-        shell: pwsh
-        run: |
-          $env:INCLUDE += ";C:\sleef\include;C:\quadblas\include"
-          $env:LIB += ";C:\sleef\lib"
-          $env:PATH = "C:\sleef\bin;$env:PATH"
-          echo "INCLUDE=$env:INCLUDE" >> $env:GITHUB_ENV
-          echo "LIB=$env:LIB" >> $env:GITHUB_ENV
-          echo "PATH=$env:PATH" >> $env:GITHUB_ENV
-
-      - name: Install build dependencies
-        shell: bash -l {0}
-        run: |
-          pip install -U pip
-          pip install cibuildwheel==2.20.0 ninja meson meson-python numpy delvewheel pytest
+  #     - name: Verify QuadBLAS submodule
+  #       shell: pwsh
+  #       run: |
+  #         Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/
+  #         Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
-      - name: Build wheels
-        env:
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
-          CIBW_SKIP: "pp* cp36-* cp37-* cp38-* cp39-* cp313-*"
-          CIBW_ARCHS_WINDOWS: ${{ matrix.architecture == 'x86' && 'x86' || 'AMD64' }}
-          CIBW_BUILD_VERBOSITY: "3"
-          DISTUTILS_USE_SDK: "1"
-          MSSdk: "1"
-          CIBW_BEFORE_ALL: |
-            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-            cd sleef
-            cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            cmake --build build --config Release
-            cmake --install build --prefix "C:/sleef" --config Release
-            cd ..
-            mkdir -p C:/quadblas/include
-            cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas C:/quadblas/include/
-          CIBW_BEFORE_BUILD: |
-            pip install meson meson-python ninja numpy
-          CIBW_ENVIRONMENT: >
-            INCLUDE="C:/sleef/include;C:/quadblas/include;$INCLUDE"
-            LIB="C:/sleef/lib;$LIB"
-            PATH="C:/sleef/bin;$PATH"
-            CFLAGS="/IC:/sleef/include /IC:/quadblas/include $CFLAGS"
-            CXXFLAGS="/IC:/sleef/include /IC:/quadblas/include /std:c++17 $CXXFLAGS"
-            LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
-          CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
-          CIBW_TEST_COMMAND: |
-            pip install {package}[test]
-            python -m pytest -v {project}/test
-          CIBW_TEST_EXTRAS: test
-          CIBW_TEST_FAIL_FAST: 1
-        shell: pwsh
-        run: |
-          python -m cibuildwheel --output-dir wheelhouse
-          if (-not (Test-Path wheelhouse/*.whl)) { throw "Wheel was not created" }
-        working-directory: ./quaddtype
+  #     - name: Clone and Build SLEEF
+  #       shell: pwsh
+  #       run: |
+  #         git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+  #         cd sleef
+  #         cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  #         cmake --build build --config Release
+  #         cmake --install build --prefix "C:/sleef" --config Release
 
-      - uses: actions/upload-artifact@v4
-        with:
-          path: ./quaddtype/wheelhouse/*.whl
-          name: wheels-windows-${{ matrix.architecture }}
+  #     - name: Install QuadBLAS headers from submodule
+  #       shell: pwsh
+  #       run: |
+  #         New-Item -ItemType Directory -Force -Path "C:/quadblas/include"
+  #         Copy-Item -Recurse -Force "quaddtype/numpy_quaddtype/QBLAS/include/quadblas" "C:/quadblas/include/"
 
-  publish_to_pypi:
-    name: Publish to PyPI
-    needs: [build_wheels_linux, build_wheels_macos, build_wheels_windows]
-    runs-on: ubuntu-latest
-    if: startsWith(github.ref, 'refs/tags/quaddtype-v')
-    
-    environment:
-      name: quadtype_release
-      url: https://pypi.org/p/numpy-quaddtype
-    
-    permissions:
-      id-token: write  # IMPORTANT: mandatory for trusted publishing
-    
-    steps:
-      - name: Download all workflow run artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: dist
-      
-      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          packages-dir: dist/*
\ No newline at end of file
+  #     - name: Setup build environment
+  #       shell: pwsh
+  #       run: |
+  #         $env:INCLUDE += ";C:\sleef\include;C:\quadblas\include"
+  #         $env:LIB += ";C:\sleef\lib"
+  #         $env:PATH = "C:\sleef\bin;$env:PATH"
+  #         echo "INCLUDE=$env:INCLUDE" >> $env:GITHUB_ENV
+  #         echo "LIB=$env:LIB" >> $env:GITHUB_ENV
+  #         echo "PATH=$env:PATH" >> $env:GITHUB_ENV
+
+  #     - name: Install build dependencies
+  #       shell: bash -l {0}
+  #       run: |
+  #         pip install -U pip
+  #         pip install cibuildwheel==2.20.0 ninja meson meson-python numpy delvewheel pytest
+
+  #     - name: Build wheels
+  #       env:
+  #         CIBW_BUILD: "cp310-* cp311-* cp312-*"
+  #         CIBW_SKIP: "pp* cp36-* cp37-* cp38-* cp39-* cp313-*"
+  #         CIBW_ARCHS_WINDOWS: ${{ matrix.architecture == 'x86' && 'x86' || 'AMD64' }}
+  #         CIBW_BUILD_VERBOSITY: "3"
+  #         DISTUTILS_USE_SDK: "1"
+  #         MSSdk: "1"
+  #         CIBW_BEFORE_ALL: |
+  #           git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+  #           cd sleef
+  #           cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  #           cmake --build build --config Release
+  #           cmake --install build --prefix "C:/sleef" --config Release
+  #           cd ..
+  #           mkdir -p C:/quadblas/include
+  #           cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas C:/quadblas/include/
+  #         CIBW_BEFORE_BUILD: |
+  #           pip install meson meson-python ninja numpy
+  #         CIBW_ENVIRONMENT: >
+  #           INCLUDE="C:/sleef/include;C:/quadblas/include;$INCLUDE"
+  #           LIB="C:/sleef/lib;$LIB"
+  #           PATH="C:/sleef/bin;$PATH"
+  #           CFLAGS="/IC:/sleef/include /IC:/quadblas/include $CFLAGS"
+  #           CXXFLAGS="/IC:/sleef/include /IC:/quadblas/include /std:c++17 $CXXFLAGS"
+  #           LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
+  #         CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
+  #         CIBW_TEST_COMMAND: |
+  #           pip install {package}[test]
+  #           python -m pytest -v {project}/test
+  #         CIBW_TEST_EXTRAS: test
+  #         CIBW_TEST_FAIL_FAST: 1
+  #       shell: pwsh
+  #       run: |
+  #         python -m cibuildwheel --output-dir wheelhouse
+  #         if (-not (Test-Path wheelhouse/*.whl)) { throw "Wheel was not created" }
+  #       working-directory: ./quaddtype
+
+  #     - uses: actions/upload-artifact@v4
+  #       with:
+  #         path: ./quaddtype/wheelhouse/*.whl
+  #         name: wheels-windows-${{ matrix.architecture }}
+
+  # publish_to_pypi:
+  #   name: Publish to PyPI
+  #   needs: [build_wheels_linux, build_wheels_macos, build_wheels_windows]
+  #   runs-on: ubuntu-latest
+  #   if: startsWith(github.ref, 'refs/tags/quaddtype-v')
+
+  #   environment:
+  #     name: quadtype_release
+  #     url: https://pypi.org/p/numpy-quaddtype
+
+  #   permissions:
+  #     id-token: write  # IMPORTANT: mandatory for trusted publishing
+
+  #   steps:
+  #     - name: Download all workflow run artifacts
+  #       uses: actions/download-artifact@v4
+  #       with:
+  #         path: dist
+
+  #     - name: Publish to PyPI
+  #       uses: pypa/gh-action-pypi-publish@release/v1
+  #       with:
+  #         packages-dir: dist/*

From f99f56567a3ef34df44f676e72149505f34f26d8 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:28:31 +0530
Subject: [PATCH 08/49] ci: drop yum install, shallow clone and sleef cleanup in CIBW_BEFORE_ALL

---
 .github/workflows/build_wheels.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 3f2b9e9..e5fe39c 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -47,8 +47,7 @@ jobs:
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
             # Install SLEEF in container
-            yum install -y cmake git gcc gcc-c++ make
-            git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
+            git clone --branch 3.8 https://github.com/shibatch/sleef.git
             cd sleef
             cmake -S . -B build \
               -DSLEEF_BUILD_QUAD:BOOL=ON \
@@ -56,8 +55,6 @@ jobs:
               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             cmake --build build/ --clean-first -j
             cmake --install build --prefix /usr/local
-            cd ..
-            rm -rf sleef
           CIBW_ENVIRONMENT: >
             CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -std=c++17 $CXXFLAGS"

From fb3579ce1cf37cb860765c183124cc548f234e00 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:32:35 +0530
Subject: [PATCH 09/49] fixing linux CI

---
 .github/workflows/build_wheels.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e5fe39c..fd09ca5 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -27,11 +27,6 @@ jobs:
         with:
           python-version: ">=3.10.0"
 
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y cmake build-essential libomp-dev git pkg-config
-
       - name: Verify QuadBLAS submodule
         run: |
           ls -la quaddtype/numpy_quaddtype/QBLAS/
@@ -46,6 +41,8 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
+            sudo apt-get update
+            sudo apt-get install -y cmake build-essential libomp-dev git pkg-config gcc g++
             # Install SLEEF in container
             git clone --branch 3.8 https://github.com/shibatch/sleef.git
             cd sleef

From 03e9acd2aee74a9473c1a4fc6e77e9b6f7e20e35 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:34:01 +0530
Subject: [PATCH 10/49] fixing linux CI

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index fd09ca5..ad5bad5 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -41,8 +41,8 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
-            sudo apt-get update
-            sudo apt-get install -y cmake build-essential libomp-dev git pkg-config gcc g++
+            apt-get update
+            apt-get install -y cmake build-essential libomp-dev git pkg-config gcc g++
             # Install SLEEF in container
             git clone --branch 3.8 https://github.com/shibatch/sleef.git
             cd sleef

From 1ed7babff356b2f140f2e09c4ce43449ba20a1ee Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:37:41 +0530
Subject: [PATCH 11/49] fixing linux CI

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index ad5bad5..c323ea4 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -41,8 +41,8 @@ jobs:
           CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_BEFORE_ALL: |
-            apt-get update
-            apt-get install -y cmake build-essential libomp-dev git pkg-config gcc g++
+            yum update -y
+            yum install -y cmake gcc gcc-c++ make git pkgconfig
             # Install SLEEF in container
             git clone --branch 3.8 https://github.com/shibatch/sleef.git
             cd sleef

From 63a355e9b47510399edcb45d4b0170e201902004 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 12:41:54 +0530
Subject: [PATCH 12/49] fixing linux CI

---
 .github/workflows/build_wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index c323ea4..47efecb 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -55,7 +55,7 @@ jobs:
           CIBW_ENVIRONMENT: >
             CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -std=c++17 $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -lsleef -lsleefquad -fopenmp $LDFLAGS"
+            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
             PKG_CONFIG_PATH="/usr/local/lib64/pkgconfig:/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
           CIBW_REPAIR_WHEEL_COMMAND: |

From 88a98d1ad8fbbbb996c6fb39da8241d68466055d Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 18:47:57 +0530
Subject: [PATCH 13/49] fixing linux CI

---
 .github/workflows/build_wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 47efecb..e4672e9 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -54,7 +54,7 @@ jobs:
             cmake --install build --prefix /usr/local
           CIBW_ENVIRONMENT: >
             CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
-            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -std=c++17 $CXXFLAGS"
+            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
             LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
             PKG_CONFIG_PATH="/usr/local/lib64/pkgconfig:/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"

From 764fc722552657d07b9e90d41f33a3f1318d08c8 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 23:45:17 +0530
Subject: [PATCH 14/49] updating qblas

---
 quaddtype/numpy_quaddtype/QBLAS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 4d81606..4853ac1 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 4d81606463d67c6a2d3efa4c647cbb95b21ddbc4
+Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d

From 1669e5f49eeed90c3a6b1c5b948a4036a6211e58 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 10 Jul 2025 23:55:49 +0530
Subject: [PATCH 15/49] fixing macos CI

---
 .github/workflows/build_wheels.yml | 148 ++++++++++++-----------------
 1 file changed, 63 insertions(+), 85 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e4672e9..e09ed84 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -73,99 +73,77 @@ jobs:
           path: ./quaddtype/wheelhouse/*.whl
           name: wheels-linux
 
-  # build_wheels_macos:
-  #   name: Build wheels on ${{ matrix.os }}
-  #   runs-on: ${{ matrix.os }}
-  #   strategy:
-  #     matrix:
-  #       os: [macos-13, macos-14]
+  build_wheels_macos:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [macos-13, macos-14]
 
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #       with:
-  #         submodules: recursive
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
-  #     - name: Set up Python
-  #       uses: actions/setup-python@v4
-  #       with:
-  #         python-version: "3.10"
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
 
-  #     - name: Install dependencies
-  #       run: |
-  #         brew install cmake libomp git
+      - name: Install dependencies
+        run: |
+          brew install cmake libomp git
 
-  #     - name: Verify QuadBLAS submodule
-  #       run: |
-  #         ls -la quaddtype/numpy_quaddtype/QBLAS/
-  #         ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
+      - name: Install SLEEF
+        env:
+          MACOSX_DEPLOYMENT_TARGET: "11.0"
+        run: |
+          git clone --branch 3.8 https://github.com/shibatch/sleef.git
+          cd sleef
+          cmake -S . -B build \
+            -DSLEEF_BUILD_QUAD:BOOL=ON \
+            -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
+            -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+          cmake --build build/ --clean-first -j
+          sudo cmake --install build --prefix /usr/local
 
-  #     - name: Install SLEEF and setup QuadBLAS
-  #       env:
-  #         MACOSX_DEPLOYMENT_TARGET: "11.0"
-  #       run: |
-  #         # Install SLEEF
-  #         git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-  #         cd sleef
-  #         cmake -S . -B build \
-  #           -DSLEEF_BUILD_QUAD:BOOL=ON \
-  #           -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
-  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #           -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
-  #           -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
-  #           -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-  #         cmake --build build/ --clean-first -j
-  #         sudo cmake --install build --prefix /usr/local
-  #         cd ..
-
-  #         # Install QuadBLAS headers from submodule
-  #         sudo cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
-
-  #     - name: Install cibuildwheel
-  #       run: pip install cibuildwheel==2.20.0
+      - name: Verify QuadBLAS submodule
+        run: |
+          ls -la quaddtype/numpy_quaddtype/QBLAS/
+          ls -la quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
-  #     - name: Build wheels
-  #       env:
-  #         CIBW_BUILD: "cp310-* cp311-* cp312-*"
-  #         CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
-  #         CIBW_BUILD_VERBOSITY: "1"
-  #         CIBW_BEFORE_ALL: |
-  #           # Install SLEEF
-  #           git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-  #           cd sleef
-  #           cmake -S . -B build \
-  #             -DSLEEF_BUILD_QUAD:BOOL=ON \
-  #             -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
-  #             -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #             -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
-  #             -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
-  #             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-  #           cmake --build build/ --clean-first -j
-  #           cmake --install build --prefix /usr/local
-  #           cd ..
+      - name: Install cibuildwheel
+        run: pip install cibuildwheel==2.20.0
 
-  #           # Install QuadBLAS headers from submodule
-  #           cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas /usr/local/include/
-  #         CIBW_ENVIRONMENT: >
-  #           MACOSX_DEPLOYMENT_TARGET="11.0"
-  #           DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
-  #           CFLAGS="-I/usr/local/include -I/usr/local/include/quadblas $CFLAGS"
-  #           CXXFLAGS="-I/usr/local/include -I/usr/local/include/quadblas -std=c++17 $CXXFLAGS"
-  #           LDFLAGS="-L/usr/local/lib -lsleef -lsleefquad $LDFLAGS"
-  #           PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
-  #         CIBW_REPAIR_WHEEL_COMMAND: >
-  #           delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}
-  #         CIBW_TEST_COMMAND: |
-  #           pip install {package}[test]
-  #           pytest {project}/tests
-  #         CIBW_TEST_EXTRAS: "test"
-  #       run: |
-  #         python -m cibuildwheel --output-dir wheelhouse
-  #       working-directory: ./quaddtype
+      - name: Build wheels
+        env:
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
+          CIBW_BUILD_VERBOSITY: "3"
+          CIBW_ENVIRONMENT: >
+            MACOSX_DEPLOYMENT_TARGET="11.0"
+            DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
+            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib $LDFLAGS"
+            PKG_CONFIG_PATH="/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH"
+          CIBW_REPAIR_WHEEL_COMMAND: >
+            delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}
+          CIBW_TEST_COMMAND: |
+            pip install {package}[test]
+            pytest -s {project}/tests
+          CIBW_TEST_EXTRAS: "test"
+        run: |
+          python -m cibuildwheel --output-dir wheelhouse
+        working-directory: ./quaddtype
 
-  #     - uses: actions/upload-artifact@v4
-  #       with:
-  #         path: ./quaddtype/wheelhouse/*.whl
-  #         name: wheels-${{ matrix.os }}
+      - uses: actions/upload-artifact@v4
+        with:
+          path: ./quaddtype/wheelhouse/*.whl
+          name: wheels-${{ matrix.os }}
 
   # build_wheels_windows:
   #   name: Build wheels on Windows

From b35bac3d27c30220eac15da06f7fefaf805731d5 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 00:07:51 +0530
Subject: [PATCH 16/49] bumping macos deployment target

---
 .github/workflows/build_wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e09ed84..d951321 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -96,7 +96,7 @@ jobs:
 
       - name: Install SLEEF
         env:
-          MACOSX_DEPLOYMENT_TARGET: "11.0"
+          MACOSX_DEPLOYMENT_TARGET: "14.0"
         run: |
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef

From 042b25a245d0a5841f1a353cbe4a867ee82518b5 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 00:11:21 +0530
Subject: [PATCH 17/49] bumping macos deployment target

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index d951321..6eec28d 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -104,7 +104,7 @@ jobs:
             -DSLEEF_BUILD_QUAD:BOOL=ON \
             -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
             -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
           cmake --build build/ --clean-first -j
@@ -124,7 +124,7 @@ jobs:
           CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_ENVIRONMENT: >
-            MACOSX_DEPLOYMENT_TARGET="11.0"
+            MACOSX_DEPLOYMENT_TARGET="14.0"
             DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
             CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CXXFLAGS"

From f78dd90b35b32d26ddc3acce268aa09a1674fffb Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 00:20:26 +0530
Subject: [PATCH 18/49] dynamic macos deployment target

---
 .github/workflows/build_wheels.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 6eec28d..aa82be8 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -96,7 +96,7 @@ jobs:
 
       - name: Install SLEEF
         env:
-          MACOSX_DEPLOYMENT_TARGET: "14.0"
+          MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os == 'macos-13' && '13.0' || '14.0' }}
         run: |
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef
@@ -104,7 +104,7 @@ jobs:
             -DSLEEF_BUILD_QUAD:BOOL=ON \
             -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON \
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=${{ matrix.os == 'macos-13' && '13.0' || '14.0' }} \
             -DCMAKE_INSTALL_RPATH="@loader_path/../lib" \
             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
           cmake --build build/ --clean-first -j
@@ -124,7 +124,7 @@ jobs:
           CIBW_ARCHS_MACOS: ${{ matrix.os == 'macos-13' && 'x86_64' || 'arm64' }}
           CIBW_BUILD_VERBOSITY: "3"
           CIBW_ENVIRONMENT: >
-            MACOSX_DEPLOYMENT_TARGET="14.0"
+            MACOSX_DEPLOYMENT_TARGET="${{ matrix.os == 'macos-13' && '13.0' || '14.0' }}"
             DYLD_LIBRARY_PATH="/usr/local/lib:$DYLD_LIBRARY_PATH"
             CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CXXFLAGS"

From cd88de0d03d568bc60a4031756674b064ba2b8db Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 00:43:35 +0530
Subject: [PATCH 19/49] explicit init of res array in dot-mat-mat

---
 quaddtype/numpy_quaddtype/src/quadblas_interface.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index b5cee77..555f854 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -290,6 +290,9 @@ dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b)
     }
 
     Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+    for (npy_intp i = 0; i < m * n; i++) {
+        result_data[i] = Sleef_cast_from_doubleq1(0.0);
+    }
 
     npy_intp lda, ldb, ldc;
 

From abf0224bf572972c5f8304e279b5e6785cc6f9e1 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 00:55:24 +0530
Subject: [PATCH 20/49] fixing windows CI

---
 .github/workflows/build_wheels.yml | 167 ++++++++++++-----------------
 1 file changed, 71 insertions(+), 96 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index aa82be8..4e84dc8 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -145,111 +145,86 @@ jobs:
           path: ./quaddtype/wheelhouse/*.whl
           name: wheels-${{ matrix.os }}
 
-  # build_wheels_windows:
-  #   name: Build wheels on Windows
-  #   runs-on: windows-latest
-  #   strategy:
-  #     matrix:
-  #       architecture: [x64]
-
-  #   steps:
-  #     - uses: actions/checkout@v3
-  #       with:
-  #         submodules: recursive
-
-  #     - name: Setup MSVC
-  #       uses: ilammy/msvc-dev-cmd@v1
-  #       with:
-  #         arch: ${{ matrix.architecture }}
+  build_wheels_windows:
+    name: Build wheels on Windows
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        architecture: [x64]
 
-  #     - name: Set up Python 3.10
-  #       uses: actions/setup-python@v4
-  #       with:
-  #         python-version: "3.10"
-  #         architecture: ${{ matrix.architecture }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
 
-  #     - name: Install CMake
-  #       uses: lukka/get-cmake@latest
+      - name: Setup MSVC
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: ${{ matrix.architecture }}
 
-  #     - name: Verify QuadBLAS submodule
-  #       shell: pwsh
-  #       run: |
-  #         Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/
-  #         Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+          architecture: ${{ matrix.architecture }}
 
-  #     - name: Clone and Build SLEEF
-  #       shell: pwsh
-  #       run: |
-  #         git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-  #         cd sleef
-  #         cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-  #         cmake --build build --config Release
-  #         cmake --install build --prefix "C:/sleef" --config Release
+      - name: Install CMake
+        uses: lukka/get-cmake@latest
 
-  #     - name: Install QuadBLAS headers from submodule
-  #       shell: pwsh
-  #       run: |
-  #         New-Item -ItemType Directory -Force -Path "C:/quadblas/include"
-  #         Copy-Item -Recurse -Force "quaddtype/numpy_quaddtype/QBLAS/include/quadblas" "C:/quadblas/include/"
+      - name: Verify QuadBLAS submodule
+        shell: pwsh
+        run: |
+          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/
+          Get-ChildItem quaddtype/numpy_quaddtype/QBLAS/include/quadblas/
 
-  #     - name: Setup build environment
-  #       shell: pwsh
-  #       run: |
-  #         $env:INCLUDE += ";C:\sleef\include;C:\quadblas\include"
-  #         $env:LIB += ";C:\sleef\lib"
-  #         $env:PATH = "C:\sleef\bin;$env:PATH"
-  #         echo "INCLUDE=$env:INCLUDE" >> $env:GITHUB_ENV
-  #         echo "LIB=$env:LIB" >> $env:GITHUB_ENV
-  #         echo "PATH=$env:PATH" >> $env:GITHUB_ENV
+      - name: Clone and Build SLEEF
+        shell: pwsh
+        run: |
+          git clone --branch 3.8 https://github.com/shibatch/sleef.git
+          cd sleef
+          cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+          cmake --build build --config Release
+          cmake --install build --prefix "C:/sleef" --config Release
 
-  #     - name: Install build dependencies
-  #       shell: bash -l {0}
-  #       run: |
-  #         pip install -U pip
-  #         pip install cibuildwheel==2.20.0 ninja meson meson-python numpy delvewheel pytest
+      - name: Install build dependencies
+        shell: bash -l {0}
+        run: |
+          pip install -U pip
+          pip install cibuildwheel==2.20.0 ninja meson meson-python numpy delvewheel pytest
 
-  #     - name: Build wheels
-  #       env:
-  #         CIBW_BUILD: "cp310-* cp311-* cp312-*"
-  #         CIBW_SKIP: "pp* cp36-* cp37-* cp38-* cp39-* cp313-*"
-  #         CIBW_ARCHS_WINDOWS: ${{ matrix.architecture == 'x86' && 'x86' || 'AMD64' }}
-  #         CIBW_BUILD_VERBOSITY: "3"
-  #         DISTUTILS_USE_SDK: "1"
-  #         MSSdk: "1"
-  #         CIBW_BEFORE_ALL: |
-  #           git clone --branch 3.8 --depth 1 https://github.com/shibatch/sleef.git
-  #           cd sleef
-  #           cmake -S . -B build -G "Visual Studio 17 2022" -A ${{ matrix.architecture == 'x86' && 'Win32' || 'x64' }} -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-  #           cmake --build build --config Release
-  #           cmake --install build --prefix "C:/sleef" --config Release
-  #           cd ..
-  #           mkdir -p C:/quadblas/include
-  #           cp -r quaddtype/numpy_quaddtype/QBLAS/include/quadblas C:/quadblas/include/
-  #         CIBW_BEFORE_BUILD: |
-  #           pip install meson meson-python ninja numpy
-  #         CIBW_ENVIRONMENT: >
-  #           INCLUDE="C:/sleef/include;C:/quadblas/include;$INCLUDE"
-  #           LIB="C:/sleef/lib;$LIB"
-  #           PATH="C:/sleef/bin;$PATH"
-  #           CFLAGS="/IC:/sleef/include /IC:/quadblas/include $CFLAGS"
-  #           CXXFLAGS="/IC:/sleef/include /IC:/quadblas/include /std:c++17 $CXXFLAGS"
-  #           LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
-  #         CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
-  #         CIBW_TEST_COMMAND: |
-  #           pip install {package}[test]
-  #           python -m pytest -v {project}/test
-  #         CIBW_TEST_EXTRAS: test
-  #         CIBW_TEST_FAIL_FAST: 1
-  #       shell: pwsh
-  #       run: |
-  #         python -m cibuildwheel --output-dir wheelhouse
-  #         if (-not (Test-Path wheelhouse/*.whl)) { throw "Wheel was not created" }
-  #       working-directory: ./quaddtype
+      - name: Build wheels
+        env:
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_SKIP: "pp* cp36-* cp37-* cp38-* cp39-* cp313-*"
+          CIBW_ARCHS_WINDOWS: ${{ matrix.architecture == 'x86' && 'x86' || 'AMD64' }}
+          CIBW_BUILD_VERBOSITY: "3"
+          DISTUTILS_USE_SDK: "1"
+          MSSdk: "1"
+          CIBW_BEFORE_BUILD: |
+            pip install meson meson-python ninja numpy
+          CIBW_ENVIRONMENT: >
+            INCLUDE="C:/sleef/include;{project}/numpy_quaddtype/QBLAS/include;$INCLUDE"
+            LIB="C:/sleef/lib;$LIB"
+            PATH="C:/sleef/bin;$PATH"
+            CFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
+            CXXFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include $CXXFLAGS"
+            LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
+          CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
+          CIBW_TEST_COMMAND: |
+            pip install {package}[test]
+            pytest -s {project}/tests
+          CIBW_TEST_EXTRAS: test
+          CIBW_TEST_FAIL_FAST: 1
+        shell: pwsh
+        run: |
+          python -m cibuildwheel --output-dir wheelhouse
+          if (-not (Test-Path wheelhouse/*.whl)) { throw "Wheel was not created" }
+        working-directory: ./quaddtype
 
-  #     - uses: actions/upload-artifact@v4
-  #       with:
-  #         path: ./quaddtype/wheelhouse/*.whl
-  #         name: wheels-windows-${{ matrix.architecture }}
+      - uses: actions/upload-artifact@v4
+        with:
+          path: ./quaddtype/wheelhouse/*.whl
+          name: wheels-windows-${{ matrix.architecture }}
 
   # publish_to_pypi:
   #   name: Publish to PyPI

From c5198d187356911bfea1f7cdab1bbff07115f5ea Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 02:06:51 +0530
Subject: [PATCH 21/49] disabling qblas for windows; MSVC incompatibility

---
 .github/workflows/build_wheels.yml            |  51 +--
 .../src/quadblas_interface.cpp                | 375 +++++++++++++++++-
 2 files changed, 400 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 4e84dc8..22bf987 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -145,6 +145,7 @@ jobs:
           path: ./quaddtype/wheelhouse/*.whl
           name: wheels-${{ matrix.os }}
 
+  # disabling QBLAS optimization for windows due to incompatibility with MSVC
   build_wheels_windows:
     name: Build wheels on Windows
     runs-on: windows-latest
@@ -206,8 +207,8 @@ jobs:
             INCLUDE="C:/sleef/include;{project}/numpy_quaddtype/QBLAS/include;$INCLUDE"
             LIB="C:/sleef/lib;$LIB"
             PATH="C:/sleef/bin;$PATH"
-            CFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
-            CXXFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include $CXXFLAGS"
+            CFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include /DDISABLE_QUADBLAS $CFLAGS"
+            CXXFLAGS="/IC:/sleef/include /I{project}/numpy_quaddtype/QBLAS/include /DDISABLE_QUADBLAS $CXXFLAGS"
             LDFLAGS="C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib $LDFLAGS"
           CIBW_REPAIR_WHEEL_COMMAND: 'delvewheel repair -w {dest_dir} {wheel} --add-path C:\sleef\bin'
           CIBW_TEST_COMMAND: |
@@ -226,26 +227,26 @@ jobs:
           path: ./quaddtype/wheelhouse/*.whl
           name: wheels-windows-${{ matrix.architecture }}
 
-  # publish_to_pypi:
-  #   name: Publish to PyPI
-  #   needs: [build_wheels_linux, build_wheels_macos, build_wheels_windows]
-  #   runs-on: ubuntu-latest
-  #   if: startsWith(github.ref, 'refs/tags/quaddtype-v')
-
-  #   environment:
-  #     name: quadtype_release
-  #     url: https://pypi.org/p/numpy-quaddtype
-
-  #   permissions:
-  #     id-token: write  # IMPORTANT: mandatory for trusted publishing
-
-  #   steps:
-  #     - name: Download all workflow run artifacts
-  #       uses: actions/download-artifact@v4
-  #       with:
-  #         path: dist
-
-  #     - name: Publish to PyPI
-  #       uses: pypa/gh-action-pypi-publish@release/v1
-  #       with:
-  #         packages-dir: dist/*
+  publish_to_pypi:
+    name: Publish to PyPI
+    needs: [build_wheels_linux, build_wheels_macos, build_wheels_windows]
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/quaddtype-v')
+
+    environment:
+      name: quadtype_release
+      url: https://pypi.org/p/numpy-quaddtype
+
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
+
+    steps:
+      - name: Download all workflow run artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: dist
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/*
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 555f854..6bc3fb0 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -17,7 +17,378 @@ extern "C" {
 #include "quad_common.h"
 #include "quadblas_interface.h"
 
+extern "C" {
+#include <sleef.h>
+#include <sleefquad.h>
+}
+
+#ifndef DISABLE_QUADBLAS
 #include "../QBLAS/include/quadblas/quadblas.hpp"
+#endif
+
+#ifdef DISABLE_QUADBLAS
+
+static bool
+extract_quad_array_info_simple(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend)
+{  // Validate arr and report its data pointer and backend; on failure set TypeError, return false.
+    if (!PyArray_Check(arr)) {
+        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
+        return false;
+    }
+    // The descriptor must be a QuadPrecDType instance before it can be cast below.
+    PyArray_Descr *descr = PyArray_DESCR(arr);
+    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
+        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
+        return false;
+    }
+    // *data is the raw buffer pointer typed as Sleef_quad* regardless of the backend reported.
+    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
+    *backend = quad_descr->backend;
+    *data = (Sleef_quad *)PyArray_DATA(arr);
+
+    return true;
+}
+
+static Sleef_quad *
+ensure_sleef_backend_simple(PyArrayObject *arr, QuadBackendType original_backend,
+                            Sleef_quad **temp_storage)
+{  // Return arr's elements as Sleef_quad; *temp_storage gets any malloc'ed copy (else nullptr).
+    if (original_backend == BACKEND_SLEEF) {
+        *temp_storage = nullptr;  // no copy made, so the caller has nothing to free
+        return (Sleef_quad *)PyArray_DATA(arr);
+    }
+    // Any other backend: treat the buffer as long double and convert into a temporary copy.
+    npy_intp size = PyArray_SIZE(arr);
+    *temp_storage = (Sleef_quad *)malloc(size * sizeof(Sleef_quad));
+    if (!*temp_storage) {
+        PyErr_NoMemory();  // nullptr is returned with MemoryError set
+        return nullptr;
+    }
+    // The buffer is read as densely packed long doubles, in memory order.
+    long double *ld_data = (long double *)PyArray_DATA(arr);
+    for (npy_intp i = 0; i < size; i++) {
+        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);  // goes through double
+    }
+    // The callers in this file release the copy with free().
+    return *temp_storage;
+}
+
+// ===============================================================================
+// FALLBACK IMPLEMENTATIONS (No QuadBLAS)
+// ===============================================================================
+
+static PyObject *
+dot_vector_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+{  // 1-D dot product without QuadBLAS; returns a new QuadPrecisionObject, or nullptr on failure.
+    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
+        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
+        return nullptr;
+    }
+
+    npy_intp n_a = PyArray_DIM(a, 0);
+    npy_intp n_b = PyArray_DIM(b, 0);
+
+    if (n_a != n_b) {
+        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
+        return nullptr;
+    }
+    // data_a/data_b are filled by the helper but not read again in this function.
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+
+    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
+        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
+        return nullptr;
+    }
+    // Non-SLEEF inputs are converted into malloc'ed temporaries (temp_a/temp_b).
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
+
+    if (!sleef_a || !sleef_b) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // Accumulate sum(a[i] * b[i]) with fused multiply-add, reading both buffers densely.
+    Sleef_quad result = Sleef_cast_from_doubleq1(0.0);
+    for (npy_intp i = 0; i < n_a; i++) {
+        result = Sleef_fmaq1_u05(sleef_a[i], sleef_b[i], result);
+    }
+
+    free(temp_a);
+    free(temp_b);
+    // The result uses the long double backend only when both inputs do.
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+
+    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
+    if (!result_obj) {
+        return nullptr;
+    }
+    // Store the sum in the union member matching the backend (long double goes via double).
+    if (result_backend == BACKEND_SLEEF) {
+        result_obj->value.sleef_value = result;
+    }
+    else {
+        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
+    }
+
+    return (PyObject *)result_obj;
+}
+
+static PyObject *
+dot_matrix_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+{
+    // Fallback (QuadBLAS disabled) matrix-vector product: returns a new 1-D array of length m
+    // holding A @ b, or nullptr with a Python exception set. Both inputs are indexed as dense
+    // row-major buffers, so callers are expected to pass C-contiguous arrays.
+    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
+        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
+        return nullptr;
+    }
+
+    npy_intp m = PyArray_DIM(a, 0);
+    npy_intp n = PyArray_DIM(a, 1);
+    npy_intp n_b = PyArray_DIM(b, 0);
+
+    if (n != n_b) {
+        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
+        return nullptr;
+    }
+
+    // data_a/data_b only satisfy the helper's signature; the buffers are fetched again below.
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+
+    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
+        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
+        return nullptr;
+    }
+
+    // Non-SLEEF inputs are converted into malloc'ed temporaries, freed on every path below.
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
+
+    if (!sleef_a || !sleef_b) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // The result uses the long double backend only when both inputs do.
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+
+    npy_intp result_dims[1] = {m};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // PyArray_Empty steals the dtype reference even when it fails: never DECREF it here.
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
+    if (!result) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // Write each element in the result's own storage type: a long double result array must not
+    // be filled with Sleef_quad values, whose size can differ from sizeof(long double).
+    void *result_data = PyArray_DATA(result);
+
+    // Simple matrix-vector multiplication: result[i] = sum(A[i,j] * b[j])
+    for (npy_intp i = 0; i < m; i++) {
+        Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+        for (npy_intp j = 0; j < n; j++) {
+            // Assume row-major layout: A[i,j] = sleef_a[i*n + j]
+            sum = Sleef_fmaq1_u05(sleef_a[i * n + j], sleef_b[j], sum);
+        }
+        if (result_backend == BACKEND_LONGDOUBLE) {
+            ((long double *)result_data)[i] = (long double)Sleef_cast_to_doubleq1(sum);
+        }
+        else {
+            ((Sleef_quad *)result_data)[i] = sum;
+        }
+    }
+
+    free(temp_a);
+    free(temp_b);
+
+    return (PyObject *)result;
+}
+
+static PyObject *
+dot_matrix_matrix_fallback(PyArrayObject *a, PyArrayObject *b)
+{
+    // Fallback (QuadBLAS disabled) matrix-matrix product: returns a new (m, n) array holding
+    // A @ B, or nullptr with a Python exception set. Both inputs are indexed as dense
+    // row-major buffers, so callers are expected to pass C-contiguous arrays.
+    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
+        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
+        return nullptr;
+    }
+
+    npy_intp m = PyArray_DIM(a, 0);
+    npy_intp k = PyArray_DIM(a, 1);
+    npy_intp k_b = PyArray_DIM(b, 0);
+    npy_intp n = PyArray_DIM(b, 1);
+
+    if (k != k_b) {
+        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
+        return nullptr;
+    }
+
+    // data_a/data_b only satisfy the helper's signature; the buffers are fetched again below.
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+
+    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
+        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
+        return nullptr;
+    }
+
+    // Non-SLEEF inputs are converted into malloc'ed temporaries, freed on every path below.
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
+
+    if (!sleef_a || !sleef_b) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // The result uses the long double backend only when both inputs do.
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+
+    npy_intp result_dims[2] = {m, n};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // PyArray_Empty steals the dtype reference even when it fails: never DECREF it here.
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
+    if (!result) {
+        free(temp_a);
+        free(temp_b);
+        return nullptr;
+    }
+
+    // Write each element in the result's own storage type: a long double result array must not
+    // be filled with Sleef_quad values, whose size can differ from sizeof(long double).
+    void *result_data = PyArray_DATA(result);
+
+    // Simple matrix-matrix multiplication: C[i,j] = sum(A[i,l] * B[l,j])
+    for (npy_intp i = 0; i < m; i++) {
+        for (npy_intp j = 0; j < n; j++) {
+            Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+            for (npy_intp l = 0; l < k; l++) {
+                // Row-major: A[i,l] = sleef_a[i*k + l], B[l,j] = sleef_b[l*n + j]
+                sum = Sleef_fmaq1_u05(sleef_a[i * k + l], sleef_b[l * n + j], sum);
+            }
+            if (result_backend == BACKEND_LONGDOUBLE) {
+                ((long double *)result_data)[i * n + j] = (long double)Sleef_cast_to_doubleq1(sum);
+            }
+            else {
+                ((Sleef_quad *)result_data)[i * n + j] = sum;
+            }
+        }
+    }
+
+    free(temp_a);
+    free(temp_b);
+
+    return (PyObject *)result;
+}
+
+PyObject *
+py_quadblas_dot(PyObject *self, PyObject *args)
+{
+    PyObject *a_obj, *b_obj;
+    // Python entry point for dot(a, b) in the DISABLE_QUADBLAS build; returns nullptr on error.
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return nullptr;
+    }
+    // Fallbacks index raw row-major buffers: force aligned, C-contiguous inputs (copy if needed).
+    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_IN_ARRAY);
+    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_IN_ARRAY);
+
+    if (!a || !b) {
+        Py_XDECREF(a);
+        Py_XDECREF(b);
+        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
+        return nullptr;
+    }
+    // Stays nullptr (with a ValueError set) when the dimension combination is unsupported.
+    PyObject *result = nullptr;
+
+    int ndim_a = PyArray_NDIM(a);
+    int ndim_b = PyArray_NDIM(b);
+
+    if (ndim_a == 1 && ndim_b == 1) {
+        result = dot_vector_vector_fallback(a, b);
+    }
+    else if (ndim_a == 2 && ndim_b == 1) {
+        result = dot_matrix_vector_fallback(a, b);
+    }
+    else if (ndim_a == 2 && ndim_b == 2) {
+        result = dot_matrix_matrix_fallback(a, b);
+    }
+    else if (ndim_a == 1 && ndim_b == 2) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
+    }
+    // a and b are new references owned by this function.
+    Py_DECREF(a);
+    Py_DECREF(b);
+
+    return result;
+}
+
+// Dummy implementations for other QuadBLAS functions
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args)
+{
+    // No-op in the fallback build (QuadBLAS disabled): args is never parsed, None is returned.
+    Py_RETURN_NONE;
+}
+
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+{
+    // The fallback build always reports a thread count of 1 as a Python int.
+    return PyLong_FromLong(1);
+}
+
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args)
+{  // Fallback build: returns a fixed notice string in place of a QuadBLAS version.
+    return PyUnicode_FromString("QuadBLAS is disabled for MSVC");
+}
+
+#else
 
 static QuadBLAS::Layout
 get_quadblas_layout(PyArrayObject *arr)
@@ -413,4 +784,6 @@ PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args)
 {
     return PyUnicode_FromString(QuadBLAS::VERSION);
-}
\ No newline at end of file
+}
+
+#endif  // DISABLE_QUADBLAS
\ No newline at end of file

From c0d93f8a0c2bdc6fd13931ab60160d0b8c967183 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 10:22:27 +0530
Subject: [PATCH 22/49] updating CI triggering paths

---
 .github/workflows/build_wheels.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 22bf987..c2b3e4b 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -6,11 +6,13 @@ on:
       - dot
     tags:
       - "quaddtype-v*"
-    # paths:
-    #   - "quaddtype/**"
+    paths:
+      - "quaddtype/**"
+      - ".github/workflows"
   pull_request:
-    # paths:
-    #   - "quaddtype/**"
+    paths:
+      - "quaddtype/**"
+      - ".github/workflows"
   workflow_dispatch:
 
 jobs:

From 838adee5d70fbc7a0075a98f8aa7b8358f0c58f5 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 10:23:59 +0530
Subject: [PATCH 23/49] updating CI triggering paths

---
 .github/workflows/build_wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index c2b3e4b..0bf55c4 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -8,11 +8,11 @@ on:
       - "quaddtype-v*"
     paths:
       - "quaddtype/**"
-      - ".github/workflows"
+      - ".github/workflows/**"
   pull_request:
     paths:
       - "quaddtype/**"
-      - ".github/workflows"
+      - ".github/workflows/**"
   workflow_dispatch:
 
 jobs:

From 433aa9012bfb3f85590de594b6f049f5e3f99db4 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 10:37:54 +0530
Subject: [PATCH 24/49] reverting branch to main

---
 .github/workflows/build_wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 0bf55c4..37a8038 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - dot
+      - main
     tags:
       - "quaddtype-v*"
     paths:

From 5836505d861f95677e7c8bc6d3731e7e63fe1876 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Fri, 11 Jul 2025 07:42:04 +0000
Subject: [PATCH 25/49] bumping qblas

---
 quaddtype/numpy_quaddtype/QBLAS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 4853ac1..0eabb67 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d
+Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16

From 33b48fefb2ee2795e4b8b598a864480f5028cfdb Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:04:46 +0000
Subject: [PATCH 26/49] umath refactor

---
 quaddtype/meson.build                         |  13 +-
 .../numpy_quaddtype/src/quaddtype_main.c      |   2 +-
 quaddtype/numpy_quaddtype/src/umath.cpp       | 806 ------------------
 .../numpy_quaddtype/src/umath/binary_ops.cpp  | 235 +++++
 .../numpy_quaddtype/src/umath/binary_ops.h    |   9 +
 .../src/umath/comparison_ops.cpp              | 240 ++++++
 .../src/umath/comparison_ops.h                |   9 +
 .../numpy_quaddtype/src/umath/promoters.hpp   |  90 ++
 quaddtype/numpy_quaddtype/src/umath/umath.cpp | 115 +++
 .../numpy_quaddtype/src/{ => umath}/umath.h   |   2 +-
 .../numpy_quaddtype/src/umath/unary_ops.cpp   | 214 +++++
 .../numpy_quaddtype/src/umath/unary_ops.h     |   9 +
 12 files changed, 933 insertions(+), 811 deletions(-)
 delete mode 100644 quaddtype/numpy_quaddtype/src/umath.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/binary_ops.h
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/promoters.hpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/umath.cpp
 rename quaddtype/numpy_quaddtype/src/{ => umath}/umath.h (95%)
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/unary_ops.h

diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index d1c6799..66318d6 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -50,12 +50,19 @@ srcs = [
     'numpy_quaddtype/src/scalar_ops.h',
     'numpy_quaddtype/src/scalar_ops.cpp',
     'numpy_quaddtype/src/ops.hpp',
-    'numpy_quaddtype/src/umath.h',
-    'numpy_quaddtype/src/umath.cpp',
     'numpy_quaddtype/src/dragon4.h',
     'numpy_quaddtype/src/dragon4.c',
     'numpy_quaddtype/src/quadblas_interface.h',
-    'numpy_quaddtype/src/quadblas_interface.cpp'
+    'numpy_quaddtype/src/quadblas_interface.cpp',
+    'numpy_quaddtype/src/umath/umath.h',
+    'numpy_quaddtype/src/umath/umath.cpp',
+    'numpy_quaddtype/src/umath/binary_ops.h',
+    'numpy_quaddtype/src/umath/binary_ops.cpp',
+    'numpy_quaddtype/src/umath/unary_ops.h',
+    'numpy_quaddtype/src/umath/unary_ops.cpp',
+    'numpy_quaddtype/src/umath/comparison_ops.h',
+    'numpy_quaddtype/src/umath/comparison_ops.cpp',
+    'numpy_quaddtype/src/umath/promoters.hpp',
 ]
 
 py.install_sources(
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index 9e2b843..641200d 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -14,7 +14,7 @@
 
 #include "scalar.h"
 #include "dtype.h"
-#include "umath.h"
+#include "umath/umath.h"
 #include "quad_common.h"
 #include "quadblas_interface.h"
 #include "float.h"
diff --git a/quaddtype/numpy_quaddtype/src/umath.cpp b/quaddtype/numpy_quaddtype/src/umath.cpp
deleted file mode 100644
index 0058236..0000000
--- a/quaddtype/numpy_quaddtype/src/umath.cpp
+++ /dev/null
@@ -1,806 +0,0 @@
-#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
-#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
-#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
-#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
-#define NO_IMPORT_ARRAY
-#define NO_IMPORT_UFUNC
-
-extern "C" {
-#include <Python.h>
-#include <cstdio>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ndarraytypes.h"
-#include "numpy/ufuncobject.h"
-
-#include "numpy/dtype_api.h"
-}
-#include "quad_common.h"
-#include "scalar.h"
-#include "dtype.h"
-#include "umath.h"
-#include "ops.hpp"
-
-// helper debugging function
-static const char *
-get_dtype_name(PyArray_DTypeMeta *dtype)
-{
-    if (dtype == &QuadPrecDType) {
-        return "QuadPrecDType";
-    }
-    else if (dtype == &PyArray_BoolDType) {
-        return "BoolDType";
-    }
-    else if (dtype == &PyArray_ByteDType) {
-        return "ByteDType";
-    }
-    else if (dtype == &PyArray_UByteDType) {
-        return "UByteDType";
-    }
-    else if (dtype == &PyArray_ShortDType) {
-        return "ShortDType";
-    }
-    else if (dtype == &PyArray_UShortDType) {
-        return "UShortDType";
-    }
-    else if (dtype == &PyArray_IntDType) {
-        return "IntDType";
-    }
-    else if (dtype == &PyArray_UIntDType) {
-        return "UIntDType";
-    }
-    else if (dtype == &PyArray_LongDType) {
-        return "LongDType";
-    }
-    else if (dtype == &PyArray_ULongDType) {
-        return "ULongDType";
-    }
-    else if (dtype == &PyArray_LongLongDType) {
-        return "LongLongDType";
-    }
-    else if (dtype == &PyArray_ULongLongDType) {
-        return "ULongLongDType";
-    }
-    else if (dtype == &PyArray_FloatDType) {
-        return "FloatDType";
-    }
-    else if (dtype == &PyArray_DoubleDType) {
-        return "DoubleDType";
-    }
-    else if (dtype == &PyArray_LongDoubleDType) {
-        return "LongDoubleDType";
-    }
-    else {
-        return "UnknownDType";
-    }
-}
-
-static NPY_CASTING
-quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                  PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
-                                  npy_intp *NPY_UNUSED(view_offset))
-{
-    Py_INCREF(given_descrs[0]);
-    loop_descrs[0] = given_descrs[0];
-
-    if (given_descrs[1] == NULL) {
-        Py_INCREF(given_descrs[0]);
-        loop_descrs[1] = given_descrs[0];
-    }
-    else {
-        Py_INCREF(given_descrs[1]);
-        loop_descrs[1] = given_descrs[1];
-    }
-
-    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1];
-
-    if (descr_in->backend != descr_out->backend) {
-        return NPY_UNSAFE_CASTING;
-    }
-
-    return NPY_NO_CASTING;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                             npy_intp const dimensions[], npy_intp const strides[],
-                                             NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in_ptr = data[0];
-    char *out_ptr = data[1];
-    npy_intp in_stride = strides[0];
-    npy_intp out_stride = strides[1];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in, out;
-    while (N--) {
-        memcpy(&in, in_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in.sleef_value);
-        }
-        else {
-            out.longdouble_value = longdouble_op(&in.longdouble_value);
-        }
-        memcpy(out_ptr, &out, elem_size);
-
-        in_ptr += in_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                           npy_intp const dimensions[], npy_intp const strides[],
-                                           NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in_ptr = data[0];
-    char *out_ptr = data[1];
-    npy_intp in_stride = strides[0];
-    npy_intp out_stride = strides[1];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in_ptr);
-        }
-        in_ptr += in_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
-int
-create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_unary_op_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_unary_op_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_unary_op",
-            .nin = 1,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = NPY_METH_SUPPORTS_UNALIGNED,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    return 0;
-}
-
-int
-init_quad_unary_ops(PyObject *numpy)
-{
-    if (create_quad_unary_ufunc<quad_negative, ld_negative>(numpy, "negative") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_positive, ld_positive>(numpy, "positive") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_absolute, ld_absolute>(numpy, "absolute") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_rint, ld_rint>(numpy, "rint") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_trunc, ld_trunc>(numpy, "trunc") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_floor, ld_floor>(numpy, "floor") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_ceil, ld_ceil>(numpy, "ceil") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_sqrt, ld_sqrt>(numpy, "sqrt") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_square, ld_square>(numpy, "square") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log, ld_log>(numpy, "log") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log2, ld_log2>(numpy, "log2") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log10, ld_log10>(numpy, "log10") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_log1p, ld_log1p>(numpy, "log1p") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_exp, ld_exp>(numpy, "exp") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_exp2, ld_exp2>(numpy, "exp2") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_sin, ld_sin>(numpy, "sin") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_cos, ld_cos>(numpy, "cos") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_tan, ld_tan>(numpy, "tan") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_asin, ld_asin>(numpy, "arcsin") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_acos, ld_acos>(numpy, "arccos") < 0) {
-        return -1;
-    }
-    if (create_quad_unary_ufunc<quad_atan, ld_atan>(numpy, "arctan") < 0) {
-        return -1;
-    }
-    return 0;
-}
-
-// Binary ufuncs
-
-static NPY_CASTING
-quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                   PyArray_Descr *const given_descrs[],
-                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
-{
-    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
-
-    // Determine target backend and if casting is needed
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
-    }
-
-    // Set up input descriptors, casting if necessary
-    for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
-    }
-
-    // Set up output descriptor
-    if (given_descrs[2] == NULL) {
-        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-        if (!loop_descrs[2]) {
-            return (NPY_CASTING)-1;
-        }
-    }
-    else {
-        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
-        if (descr_out->backend != target_backend) {
-            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[2]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[2]);
-            loop_descrs[2] = given_descrs[2];
-        }
-    }
-    return casting;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                          npy_intp const dimensions[], npy_intp const strides[],
-                                          NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in1, in2, out;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
-        }
-        memcpy(out_ptr, &out, elem_size);
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                        npy_intp const dimensions[], npy_intp const strides[],
-                                        NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
-        }
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-static int
-quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                    PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    int nin = ufunc->nin;
-    int nargs = ufunc->nargs;
-    PyArray_DTypeMeta *common = NULL;
-    bool has_quad = false;
-
-    // Handle the special case for reductions
-    if (op_dtypes[0] == NULL) {
-        assert(nin == 2 && ufunc->nout == 1); /* must be reduction */
-        for (int i = 0; i < 3; i++) {
-            Py_INCREF(op_dtypes[1]);
-            new_op_dtypes[i] = op_dtypes[1];
-        }
-        return 0;
-    }
-
-    // Check if any input or signature is QuadPrecision
-    for (int i = 0; i < nin; i++) {
-        if (op_dtypes[i] == &QuadPrecDType) {
-            has_quad = true;
-        }
-    }
-
-    if (has_quad) {
-        common = &QuadPrecDType;
-    }
-    else {
-        for (int i = nin; i < nargs; i++) {
-            if (signature[i] != NULL) {
-                if (common == NULL) {
-                    Py_INCREF(signature[i]);
-                    common = signature[i];
-                }
-                else if (common != signature[i]) {
-                    Py_CLEAR(common);  // Not homogeneous, unset common
-                    break;
-                }
-            }
-        }
-    }
-    // If no common output dtype, use standard promotion for inputs
-    if (common == NULL) {
-        common = PyArray_PromoteDTypeSequence(nin, op_dtypes);
-        if (common == NULL) {
-            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
-                PyErr_Clear();  // Do not propagate normal promotion errors
-            }
-
-            return -1;
-        }
-    }
-
-    // Set all new_op_dtypes to the common dtype
-    for (int i = 0; i < nargs; i++) {
-        if (signature[i]) {
-            // If signature is specified for this argument, use it
-            Py_INCREF(signature[i]);
-            new_op_dtypes[i] = signature[i];
-        }
-        else {
-            // Otherwise, use the common dtype
-            Py_INCREF(common);
-
-            new_op_dtypes[i] = common;
-        }
-    }
-
-    Py_XDECREF(common);
-
-    return 0;
-}
-
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_binop_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_binop_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_binop",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-    return 0;
-}
-
-int
-init_quad_binary_ops(PyObject *numpy)
-{
-    if (create_quad_binary_ufunc<quad_add, ld_add>(numpy, "add") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_sub, ld_sub>(numpy, "subtract") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mul, ld_mul>(numpy, "multiply") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_div, ld_div>(numpy, "divide") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_pow, ld_pow>(numpy, "power") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mod, ld_mod>(numpy, "mod") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_minimum, ld_minimum>(numpy, "minimum") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_maximum, ld_maximum>(numpy, "maximum") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_atan2, ld_atan2>(numpy, "arctan2") < 0) {
-        return -1;
-    }
-    return 0;
-}
-
-// comparison functions
-
-static NPY_CASTING
-quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                       PyArray_Descr *const given_descrs[],
-                                       PyArray_Descr *loop_descrs[],
-                                       npy_intp *NPY_UNUSED(view_offset))
-{
-    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
-    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
-
-    // As dealing with different backends then cast to boolean
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
-    }
-
-    // Set up input descriptors, casting if necessary
-    for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
-    }
-
-    // Set up output descriptor
-    loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL);
-    if (!loop_descrs[2]) {
-        return (NPY_CASTING)-1;
-    }
-    return casting;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
-                               npy_intp const dimensions[], npy_intp const strides[],
-                               NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    quad_value in1, in2;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        npy_bool result;
-
-        if (backend == BACKEND_SLEEF) {
-            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
-        }
-
-        memcpy(out_ptr, &result, sizeof(npy_bool));
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                       npy_intp const dimensions[], npy_intp const strides[],
-                                       NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    while (N--) {
-        quad_value in1 = *(quad_value *)in1_ptr;
-        quad_value in2 = *(quad_value *)in2_ptr;
-
-        npy_bool result;
-
-        if (backend == BACKEND_SLEEF) {
-            result = sleef_comp(&in1.sleef_value, &in2.sleef_value);
-        }
-        else {
-            result = ld_comp(&in1.longdouble_value, &in2.longdouble_value);
-        }
-
-        *(npy_bool *)out_ptr = result;
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-NPY_NO_EXPORT int
-comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                          PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    PyArray_DTypeMeta *new_signature[NPY_MAXARGS];
-    memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *));
-    new_signature[2] = NULL;
-    int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes);
-    if (res < 0) {
-        return -1;
-    }
-    Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType);
-    return 0;
-}
-
-template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
-int
-create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_comp_strided_loop_aligned<sleef_comp, ld_comp>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_comp_strided_loop<sleef_comp, ld_comp>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_comp",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_SAFE_CASTING,
-            .flags = NPY_METH_SUPPORTS_UNALIGNED,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-
-    return 0;
-}
-
-int
-init_quad_comps(PyObject *numpy)
-{
-    if (create_quad_comparison_ufunc<quad_equal, ld_equal>(numpy, "equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_notequal, ld_notequal>(numpy, "not_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_less, ld_less>(numpy, "less") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_lessequal, ld_lessequal>(numpy, "less_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greater, ld_greater>(numpy, "greater") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greaterequal, ld_greaterequal>(numpy, "greater_equal") <
-        0) {
-        return -1;
-    }
-
-    return 0;
-}
-
-int
-init_quad_umath(void)
-{
-    PyObject *numpy = PyImport_ImportModule("numpy");
-    if (!numpy) {
-        PyErr_SetString(PyExc_ImportError, "Failed to import numpy module");
-        return -1;
-    }
-
-    if (init_quad_unary_ops(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations");
-        goto err;
-    }
-
-    if (init_quad_binary_ops(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations");
-        goto err;
-    }
-
-    if (init_quad_comps(numpy) < 0) {
-        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations");
-        goto err;
-    }
-
-    Py_DECREF(numpy);
-    return 0;
-
-err:
-    Py_DECREF(numpy);
-    return -1;
-}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
new file mode 100644
index 0000000..aa6d19c
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.cpp
@@ -0,0 +1,235 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+#include "promoters.hpp"
+#include "binary_ops.h"
+
+static NPY_CASTING
+quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                   PyArray_Descr *const given_descrs[],
+                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+{
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
+    // Fills loop_descrs[0..2] for a quad binary op; returns the cast level or (NPY_CASTING)-1.
+    // Operands with different backends are both run on the long double backend (safe cast).
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
+
+    // Inputs: INCREF and reuse a descriptor already on the target backend, else request a new one
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
+        }
+    }
+
+    // Output: same rule; a missing (NULL) output descriptor always gets a new instance
+    if (given_descrs[2] == NULL) {
+        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+        if (!loop_descrs[2]) {
+            return (NPY_CASTING)-1;
+        }
+    }
+    else {
+        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
+        if (descr_out->backend != target_backend) {
+            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[2]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[2]);
+            loop_descrs[2] = given_descrs[2];
+        }
+    }
+    return casting;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                          npy_intp const dimensions[], npy_intp const strides[],
+                                          NpyAuxData *auxdata)
+{
+    // Unaligned variant: both operands are copied into local quad_value storage with memcpy
+    // before the operation, and the result is copied back to the output the same way.
+    const bool use_sleef =
+            ((QuadPrecDTypeObject *)context->descriptors[0])->backend == BACKEND_SLEEF;
+    const size_t nbytes = use_sleef ? sizeof(Sleef_quad) : sizeof(long double);
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp dst_step = strides[2];
+    char *lhs = data[0], *rhs = data[1], *dst = data[2];
+    npy_intp count = dimensions[0];
+
+    quad_value a, b, res;
+    while (count--) {
+        memcpy(&a, lhs, nbytes);
+        memcpy(&b, rhs, nbytes);
+        if (use_sleef) {
+            res.sleef_value = sleef_op(&a.sleef_value, &b.sleef_value);
+        }
+        else {
+            res.longdouble_value = longdouble_op(&a.longdouble_value, &b.longdouble_value);
+        }
+        memcpy(dst, &res, nbytes);
+
+        lhs += lhs_step;
+        rhs += rhs_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                        npy_intp const dimensions[], npy_intp const strides[],
+                                        NpyAuxData *auxdata)
+{
+    // Aligned variant: operands are read and the result is written in place through pointers
+    // of the backend's own type (Sleef_quad or long double), without intermediate copies.
+    const bool use_sleef =
+            ((QuadPrecDTypeObject *)context->descriptors[0])->backend == BACKEND_SLEEF;
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp dst_step = strides[2];
+    char *lhs = data[0], *rhs = data[1], *dst = data[2];
+    npy_intp count = dimensions[0];
+
+    while (count--) {
+        if (use_sleef) {
+            *(Sleef_quad *)dst = sleef_op((Sleef_quad *)lhs, (Sleef_quad *)rhs);
+        }
+        else {
+            *(long double *)dst = longdouble_op((long double *)lhs, (long double *)rhs);
+        }
+
+        lhs += lhs_step;
+        rhs += rhs_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_binop_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_binop_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_binop",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == 0) {
+        Py_DECREF(promoter_capsule);
+        return -1;
+    }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(DTypes);
+        return -1;
+    }
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(DTypes);
+    return 0;
+}
+
+// Adds the quad loops and promoter to each NumPy binary ufunc below; -1 on the first failure.
+int
+init_quad_binary_ops(PyObject *numpy)
+{
+    if (create_quad_binary_ufunc<quad_add, ld_add>(numpy, "add") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_sub, ld_sub>(numpy, "subtract") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_mul, ld_mul>(numpy, "multiply") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_div, ld_div>(numpy, "divide") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_pow, ld_pow>(numpy, "power") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_mod, ld_mod>(numpy, "mod") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_minimum, ld_minimum>(numpy, "minimum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_maximum, ld_maximum>(numpy, "maximum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc<quad_atan2, ld_atan2>(numpy, "arctan2") < 0) {
+        return -1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/binary_ops.h b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h
new file mode 100644
index 0000000..c45ec45
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/binary_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_BINARY_OPS_H
+#define _QUADDTYPE_BINARY_OPS_H
+
+#include <Python.h>
+// Registers the quad-precision loops on NumPy's binary ufuncs; returns 0, or -1 on failure.
+int
+init_quad_binary_ops(PyObject *numpy);
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
new file mode 100644
index 0000000..095b6d3
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.cpp
@@ -0,0 +1,240 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "umath.h"
+#include "../ops.hpp"
+#include "promoters.hpp"
+#include "binary_ops.h"
+#include "comparison_ops.h"
+
+
+static NPY_CASTING
+quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                       PyArray_Descr *const given_descrs[],
+                                       PyArray_Descr *loop_descrs[],
+                                       npy_intp *NPY_UNUSED(view_offset))
+{
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
+    // Fills loop_descrs for a quad comparison; returns the cast level or (NPY_CASTING)-1.
+    // Inputs with different backends are both compared on the long double backend (safe cast).
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
+
+    // Inputs: INCREF and reuse a descriptor already on the target backend, else request a new one
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
+        }
+    }
+
+    // Output is always the NPY_BOOL descriptor; given_descrs[2] is not consulted
+    loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL);
+    if (!loop_descrs[2]) {
+        return (NPY_CASTING)-1;
+    }
+    return casting;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                               npy_intp const dimensions[], npy_intp const strides[],
+                               NpyAuxData *auxdata)
+{
+    // Unaligned comparison loop: both operands are copied into local quad_value storage
+    // with memcpy, compared with the backend's comparator, and the npy_bool result is
+    // copied to the output with memcpy as well.
+    const bool use_sleef =
+            ((QuadPrecDTypeObject *)context->descriptors[0])->backend == BACKEND_SLEEF;
+    const size_t nbytes = use_sleef ? sizeof(Sleef_quad) : sizeof(long double);
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp dst_step = strides[2];
+    char *lhs = data[0], *rhs = data[1], *dst = data[2];
+    npy_intp count = dimensions[0];
+
+    quad_value a, b;
+    while (count--) {
+        npy_bool flag;
+        memcpy(&a, lhs, nbytes);
+        memcpy(&b, rhs, nbytes);
+
+        if (use_sleef) {
+            flag = sleef_comp(&a.sleef_value, &b.sleef_value);
+        }
+        else {
+            flag = ld_comp(&a.longdouble_value, &b.longdouble_value);
+        }
+        memcpy(dst, &flag, sizeof(npy_bool));
+
+        lhs += lhs_step;
+        rhs += rhs_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                       npy_intp const dimensions[], npy_intp const strides[],
+                                       NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    // Aligned comparison loop. Operands are compared in place through a pointer of the
+    // backend's own type (as the aligned binary loop does) rather than by loading a whole
+    // quad_value union, which may be wider than one long double element.
+    while (N--) {
+        npy_bool result;
+
+        if (backend == BACKEND_SLEEF) {
+            result = sleef_comp((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
+        }
+        else {
+            result = ld_comp((long double *)in1_ptr, (long double *)in2_ptr);
+        }
+
+        *(npy_bool *)out_ptr = result;
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+// Promoter for quad comparisons: inputs follow quad_ufunc_promoter, the output is always Bool.
+NPY_NO_EXPORT int
+comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
+                          PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
+{
+    PyArray_DTypeMeta *new_signature[NPY_MAXARGS];
+    memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *));
+    new_signature[2] = NULL;  // mask the output so the shared promoter fills that slot itself
+    if (quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes) < 0) {
+        return -1;
+    }
+    Py_INCREF(&PyArray_BoolDType);  // Py_XSETREF takes no reference; the slot must own one
+    Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType);
+    return 0;
+}
+
+template <cmp_quad_def sleef_comp, cmp_londouble_def ld_comp>
+int
+create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    // Adds the quad comparison loops and promoter to numpy.<ufunc_name>; returns 0 or -1.
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_comp_strided_loop_aligned<sleef_comp, ld_comp>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_comp_strided_loop<sleef_comp, ld_comp>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_comp",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_SAFE_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+    // `ufunc` is a new reference (PyObject_GetAttrString); release it on every exit path below.
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType);
+    if (DTypes == 0) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    int res = PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule);
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(DTypes);
+    Py_DECREF(ufunc);
+    return res < 0 ? -1 : 0;
+}
+
+int
+init_quad_comps(PyObject *numpy)
+{
+    // Adds the quad loops and promoter to six NumPy comparison ufuncs; -1 on the first failure.
+    if (create_quad_comparison_ufunc<quad_equal, ld_equal>(numpy, "equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_notequal, ld_notequal>(numpy, "not_equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_less, ld_less>(numpy, "less") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_lessequal, ld_lessequal>(numpy, "less_equal") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_greater, ld_greater>(numpy, "greater") < 0) {
+        return -1;
+    }
+    if (create_quad_comparison_ufunc<quad_greaterequal, ld_greaterequal>(numpy, "greater_equal") <
+        0) {
+        return -1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
new file mode 100644
index 0000000..e3b8cc0
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/comparison_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_COMPARISON_OPS_H
+#define _QUADDTYPE_COMPARISON_OPS_H
+
+#include <Python.h>
+// Registers the quad-precision loops on NumPy's comparison ufuncs; returns 0, or -1 on failure.
+int
+init_quad_comps(PyObject *numpy);
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/promoters.hpp b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp
new file mode 100644
index 0000000..3b3c1ef
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/promoters.hpp
@@ -0,0 +1,90 @@
+#ifndef _QUADDTYPE_PROMOTERS
+#define _QUADDTYPE_PROMOTERS
+
+#include <Python.h>
+#include <cstdio>
+#include <cassert>
+#include "numpy/arrayobject.h"
+#include "numpy/ndarrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+
+#include "../dtype.h"
+
+inline int
+quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
+                    PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
+{
+    // Promoter: picks a DType for every operand and stores owned references in new_op_dtypes.
+    int nin = ufunc->nin;
+    int nargs = ufunc->nargs;
+    PyArray_DTypeMeta *common = NULL;
+    bool has_quad = false;
+
+    // Handle the special case for reductions
+    if (op_dtypes[0] == NULL) {
+        assert(nin == 2 && ufunc->nout == 1); /* must be reduction */
+        for (int i = 0; i < 3; i++) {
+            Py_INCREF(op_dtypes[1]);
+            new_op_dtypes[i] = op_dtypes[1];
+        }
+        return 0;
+    }
+
+    // Check whether any input operand is QuadPrecision (the signature is not inspected here)
+    for (int i = 0; i < nin; i++) {
+        if (op_dtypes[i] == &QuadPrecDType) {
+            has_quad = true;
+        }
+    }
+
+    if (has_quad) {
+        Py_INCREF(&QuadPrecDType);  // `common` must be an owned reference: it is released below
+        common = &QuadPrecDType;
+    }
+    else {
+        for (int i = nin; i < nargs; i++) {
+            if (signature[i] != NULL) {
+                if (common == NULL) {
+                    Py_INCREF(signature[i]);
+                    common = signature[i];
+                }
+                else if (common != signature[i]) {
+                    Py_CLEAR(common);  // Not homogeneous, unset common
+                    break;
+                }
+            }
+        }
+    }
+    // If no common output dtype, use standard promotion for inputs
+    if (common == NULL) {
+        common = PyArray_PromoteDTypeSequence(nin, op_dtypes);
+        if (common == NULL) {
+            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                PyErr_Clear();  // Do not propagate normal promotion errors
+            }
+            return -1;
+        }
+    }
+
+    // Set all new_op_dtypes to the common dtype
+    for (int i = 0; i < nargs; i++) {
+        if (signature[i]) {
+            // If signature is specified for this argument, use it
+            Py_INCREF(signature[i]);
+            new_op_dtypes[i] = signature[i];
+        }
+        else {
+            // Otherwise, use the common dtype
+            Py_INCREF(common);
+            new_op_dtypes[i] = common;
+        }
+    }
+
+    // Release the local reference; every slot above took its own
+    Py_XDECREF(common);
+    return 0;
+}
+
+
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
new file mode 100644
index 0000000..2ea864e
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
@@ -0,0 +1,115 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+extern "C" {
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+}
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "umath.h"
+#include "../ops.hpp"
+#include "unary_ops.h"
+#include "binary_ops.h"
+#include "comparison_ops.h"
+
+// helper debugging function
+static const char *
+get_dtype_name(PyArray_DTypeMeta *dtype)
+{
+    if (dtype == &QuadPrecDType) {
+        return "QuadPrecDType";
+    }
+    else if (dtype == &PyArray_BoolDType) {
+        return "BoolDType";
+    }
+    else if (dtype == &PyArray_ByteDType) {
+        return "ByteDType";
+    }
+    else if (dtype == &PyArray_UByteDType) {
+        return "UByteDType";
+    }
+    else if (dtype == &PyArray_ShortDType) {
+        return "ShortDType";
+    }
+    else if (dtype == &PyArray_UShortDType) {
+        return "UShortDType";
+    }
+    else if (dtype == &PyArray_IntDType) {
+        return "IntDType";
+    }
+    else if (dtype == &PyArray_UIntDType) {
+        return "UIntDType";
+    }
+    else if (dtype == &PyArray_LongDType) {
+        return "LongDType";
+    }
+    else if (dtype == &PyArray_ULongDType) {
+        return "ULongDType";
+    }
+    else if (dtype == &PyArray_LongLongDType) {
+        return "LongLongDType";
+    }
+    else if (dtype == &PyArray_ULongLongDType) {
+        return "ULongLongDType";
+    }
+    else if (dtype == &PyArray_FloatDType) {
+        return "FloatDType";
+    }
+    else if (dtype == &PyArray_DoubleDType) {
+        return "DoubleDType";
+    }
+    else if (dtype == &PyArray_LongDoubleDType) {
+        return "LongDoubleDType";
+    }
+    else {
+        return "UnknownDType";
+    }
+}
+
+int
+init_quad_umath(void)
+{
+    PyObject *numpy = PyImport_ImportModule("numpy");
+    if (!numpy) {
+        PyErr_SetString(PyExc_ImportError, "Failed to import numpy module");
+        return -1;
+    }
+
+    if (init_quad_unary_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations");
+        goto err;
+    }
+
+    if (init_quad_binary_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations");
+        goto err;
+    }
+
+    if (init_quad_comps(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations");
+        goto err;
+    }
+
+    // if (init_quad_matmul(numpy) < 0) {
+    //     PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
+    //     goto err;
+    // }
+
+    Py_DECREF(numpy);
+    return 0;
+
+err:
+    Py_DECREF(numpy);
+    return -1;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath.h b/quaddtype/numpy_quaddtype/src/umath/umath.h
similarity index 95%
rename from quaddtype/numpy_quaddtype/src/umath.h
rename to quaddtype/numpy_quaddtype/src/umath/umath.h
index d64f26b..c9253ef 100644
--- a/quaddtype/numpy_quaddtype/src/umath.h
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.h
@@ -12,4 +12,4 @@ init_quad_umath(void);
 }
 #endif
 
-#endif
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
new file mode 100644
index 0000000..4c8f31f
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.cpp
@@ -0,0 +1,214 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+extern "C" {
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/ufuncobject.h"
+
+#include "numpy/dtype_api.h"
+}
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+
+static NPY_CASTING
+quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                  PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
+                                  npy_intp *NPY_UNUSED(view_offset))
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+
+    if (given_descrs[1] == NULL) {
+        Py_INCREF(given_descrs[0]);
+        loop_descrs[1] = given_descrs[0];
+    }
+    else {
+        Py_INCREF(given_descrs[1]);
+        loop_descrs[1] = given_descrs[1];
+    }
+
+    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1];
+
+    if (descr_in->backend != descr_out->backend) {
+        return NPY_UNSAFE_CASTING;
+    }
+
+    return NPY_NO_CASTING;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                             npy_intp const dimensions[], npy_intp const strides[],
+                                             NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in, out;
+    while (N--) {
+        memcpy(&in, in_ptr, elem_size);
+        if (backend == BACKEND_SLEEF) {
+            out.sleef_value = sleef_op(&in.sleef_value);
+        }
+        else {
+            out.longdouble_value = longdouble_op(&in.longdouble_value);
+        }
+        memcpy(out_ptr, &out, elem_size);
+
+        in_ptr += in_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                           npy_intp const dimensions[], npy_intp const strides[],
+                                           NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+
+    while (N--) {
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr);
+        }
+        else {
+            *(long double *)out_ptr = longdouble_op((long double *)in_ptr);
+        }
+        in_ptr += in_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <unary_op_quad_def sleef_op, unary_op_longdouble_def longdouble_op>
+int
+create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_unary_op_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_unary_op_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_unary_op",
+            .nin = 1,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+int
+init_quad_unary_ops(PyObject *numpy)
+{
+    if (create_quad_unary_ufunc<quad_negative, ld_negative>(numpy, "negative") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_positive, ld_positive>(numpy, "positive") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_absolute, ld_absolute>(numpy, "absolute") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_rint, ld_rint>(numpy, "rint") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_trunc, ld_trunc>(numpy, "trunc") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_floor, ld_floor>(numpy, "floor") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_ceil, ld_ceil>(numpy, "ceil") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_sqrt, ld_sqrt>(numpy, "sqrt") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_square, ld_square>(numpy, "square") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log, ld_log>(numpy, "log") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log2, ld_log2>(numpy, "log2") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log10, ld_log10>(numpy, "log10") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_log1p, ld_log1p>(numpy, "log1p") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_exp, ld_exp>(numpy, "exp") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_exp2, ld_exp2>(numpy, "exp2") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_sin, ld_sin>(numpy, "sin") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_cos, ld_cos>(numpy, "cos") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_tan, ld_tan>(numpy, "tan") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_asin, ld_asin>(numpy, "arcsin") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_acos, ld_acos>(numpy, "arccos") < 0) {
+        return -1;
+    }
+    if (create_quad_unary_ufunc<quad_atan, ld_atan>(numpy, "arctan") < 0) {
+        return -1;
+    }
+    return 0;
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/unary_ops.h b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h
new file mode 100644
index 0000000..6c3f17a
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/unary_ops.h
@@ -0,0 +1,9 @@
+#ifndef _QUADDTYPE_UNARY_OPS_H
+#define _QUADDTYPE_UNARY_OPS_H
+
+#include <Python.h>
+
+int
+init_quad_unary_ops(PyObject *numpy);
+
+#endif

From a35abce6cdcfbcdf9b660c41b21c5ce9be9c8c15 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:13:56 +0000
Subject: [PATCH 27/49] updaing ci

---
 .github/workflows/build_wheels.yml |  2 +-
 .github/workflows/ci.yml           | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 37a8038..e16e2ff 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - main
+      - matmul-ufunc
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9267d9f..0b4f219 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - main
+      - matmul-ufunc
   pull_request:
   workflow_dispatch:
 
@@ -61,15 +61,21 @@ jobs:
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
       - name: Install SLEEF
         run: |
+          yum update -y
+          yum install -y cmake gcc gcc-c++ make git pkgconfig
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
           cmake --build build/ --clean-first -j
-          sudo cmake --install build --prefix /usr
+          sudo cmake --install build --prefix /usr/local
       - name: Install quaddtype
         working-directory: quaddtype
         run: |
-          LDFLAGS="-Wl,-rpath,/usr/lib" python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
+            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
+            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
+            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
+            LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+            python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype
         run: |

From 851654472afaff41c5b4a56701c1ea8de644b522 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:16:54 +0000
Subject: [PATCH 28/49] switching to apt

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0b4f219..7a1b7e9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -61,8 +61,8 @@ jobs:
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
       - name: Install SLEEF
         run: |
-          yum update -y
-          yum install -y cmake gcc gcc-c++ make git pkgconfig
+          sudo apt-get update -y
+          sudo apt-get install -y cmake gcc g++ make git pkg-config
           git clone --branch 3.8 https://github.com/shibatch/sleef.git
           cd sleef
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON

From 335f42534d2ec92130cb03a5d5dfa1916eab5045 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:22:01 +0000
Subject: [PATCH 29/49] submodule fix

---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7a1b7e9..d2aa43f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -75,6 +75,9 @@ jobs:
             CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
             LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+            
+            git submodule update --init --recursive
+            ls -la quaddtype/numpy_quaddtype/QBLAS/
             python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype

From 85e7840aab3737622203d0b62100df69d05b175e Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:26:37 +0000
Subject: [PATCH 30/49] submodule fix

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d2aa43f..ff64875 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,7 +77,7 @@ jobs:
             LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
             
             git submodule update --init --recursive
-            ls -la quaddtype/numpy_quaddtype/QBLAS/
+            ls -la numpy_quaddtype/QBLAS/
             python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
       - name: Run quaddtype tests
         working-directory: quaddtype

From e467f4b01c09c45d5bd0f3cefc80cc64c45b4772 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Wed, 16 Jul 2025 12:32:42 +0000
Subject: [PATCH 31/49] submodule fix

---
 .github/workflows/ci.yml | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ff64875..a70e8c1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,6 +59,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y libmpfr-dev libssl-dev libfftw3-dev
+
       - name: Install SLEEF
         run: |
           sudo apt-get update -y
@@ -68,17 +69,27 @@ jobs:
           cmake -S . -B build -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
           cmake --build build/ --clean-first -j
           sudo cmake --install build --prefix /usr/local
+
       - name: Install quaddtype
         working-directory: quaddtype
         run: |
-            CFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include $CFLAGS"
-            CXXFLAGS="-I/usr/local/include -I{project}/numpy_quaddtype/QBLAS/include -fext-numeric-literals $CXXFLAGS"
-            LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp $LDFLAGS"
-            LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
-            
-            git submodule update --init --recursive
-            ls -la numpy_quaddtype/QBLAS/
-            python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
+          # Initialize submodules first
+          git submodule update --init --recursive
+          ls -la numpy_quaddtype/QBLAS/
+          
+          # Set environment variables with proper export and correct paths
+          export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include"
+          export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals"
+          export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp"
+          export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
+          
+          # Install with meson args to ensure the C++ flags are passed through
+          python -m pip install . -v --no-build-isolation \
+            -Cbuilddir=build \
+            -C'compile-args=-v' \
+            -Csetup-args="-Dbuildtype=debug" \
+            -Csetup-args="-Dcpp_args=-fext-numeric-literals"
+
       - name: Run quaddtype tests
         working-directory: quaddtype
         run: |

From e201b90f7d5f299ab41cf04d92d9045d5da9ae21 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 07:16:29 +0000
Subject: [PATCH 32/49] initial matmul ufunc setup

---
 quaddtype/meson.build                         |   2 +
 .../src/quadblas_interface.cpp                |  91 +++++++++++
 .../numpy_quaddtype/src/umath/matmul.cpp      | 148 ++++++++++++++++++
 quaddtype/numpy_quaddtype/src/umath/matmul.h  |   8 +
 quaddtype/numpy_quaddtype/src/umath/umath.cpp |   9 +-
 5 files changed, 254 insertions(+), 4 deletions(-)
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.cpp
 create mode 100644 quaddtype/numpy_quaddtype/src/umath/matmul.h

diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index 66318d6..c000675 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -63,6 +63,8 @@ srcs = [
     'numpy_quaddtype/src/umath/comparison_ops.h',
     'numpy_quaddtype/src/umath/comparison_ops.cpp',
     'numpy_quaddtype/src/umath/promoters.hpp',
+    'numpy_quaddtype/src/umath/matmul.h',
+    'numpy_quaddtype/src/umath/matmul.cpp',
 ]
 
 py.install_sources(
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6bc3fb0..cce39c3 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,4 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
+void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
+{
+    Sleef_quad *data_a, *data_b;
+    QuadBackendType backend_a, backend_b;
+    QuadBLAS::Layout layout_a, layout_b;
+
+    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+        return nullptr;
+    }
+
+    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+
+    if (!sleef_a || !sleef_b) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+
+    QuadBackendType result_backend = BACKEND_SLEEF;
+    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+        result_backend = BACKEND_LONGDOUBLE;
+    }
+
+    npy_intp result_dims[2] = {m, n};
+    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+    if (!result_dtype) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        return nullptr;
+    }
+
+    PyArrayObject *result =
+            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
+    if (!result) {
+        QuadBLAS::aligned_free(temp_a);
+        QuadBLAS::aligned_free(temp_b);
+        Py_DECREF(result_dtype);
+        return nullptr;
+    }
+
+    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+    for (npy_intp i = 0; i < m * n; i++) {
+        result_data[i] = Sleef_cast_from_doubleq1(0.0);
+    }
+
+    npy_intp lda, ldb, ldc;
+
+    if (layout_a == QuadBLAS::Layout::RowMajor) {
+        lda = k;
+    }
+    else {
+        lda = m;
+    }
+
+    if (layout_b == QuadBLAS::Layout::RowMajor) {
+        ldb = n;
+    }
+    else {
+        ldb = k;
+    }
+
+    QuadBLAS::Layout result_layout = layout_a;
+    if (result_layout == QuadBLAS::Layout::RowMajor) {
+        ldc = n;
+    }
+    else {
+        ldc = m;
+    }
+
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
+                   ldc);
+
+    if (result_backend == BACKEND_LONGDOUBLE) {
+        long double *ld_result = (long double *)PyArray_DATA(result);
+        for (npy_intp i = 0; i < m * n; i++) {
+            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        }
+    }
+
+    QuadBLAS::aligned_free(temp_a);
+    QuadBLAS::aligned_free(temp_b);
+
+    return (PyObject *)result;
+}
+
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
new file mode 100644
index 0000000..47fdf5f
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -0,0 +1,148 @@
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+
+#include <Python.h>
+#include <cstdio>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/dtype_api.h"
+#include "numpy/ndarraytypes.h"
+
+#include "../quad_common.h"
+#include "../scalar.h"
+#include "../dtype.h"
+#include "../ops.hpp"
+#include "binary_ops.h"
+#include "matmul.h"
+
+#include <iostream>
+
+static NPY_CASTING
+quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                   PyArray_Descr *const given_descrs[],
+                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+{
+
+  NPY_CASTING casting = NPY_NO_CASTING;    
+  std::cout << "exiting the descriptor";
+  return casting;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                          npy_intp const dimensions[], npy_intp const strides[],
+                                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    quad_value in1, in2, out;
+    while (N--) {
+        memcpy(&in1, in1_ptr, elem_size);
+        memcpy(&in2, in2_ptr, elem_size);
+        if (backend == BACKEND_SLEEF) {
+            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+        }
+        else {
+            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+        }
+        memcpy(out_ptr, &out, elem_size);
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+int
+quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                        npy_intp const dimensions[], npy_intp const strides[],
+                                        NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+
+    while (N--) {
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
+        }
+        else {
+            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+        }
+
+        in1_ptr += in1_stride;
+        in2_ptr += in2_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+int
+create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
+{
+    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    if (ufunc == NULL) {
+        return -1;
+    }
+
+    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
+
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+            {NPY_METH_strided_loop,
+             (void *)&quad_generic_matmul_strided_loop_aligned<sleef_op, longdouble_op>},
+            {NPY_METH_unaligned_strided_loop,
+             (void *)&quad_generic_matmul_strided_loop_unaligned<sleef_op, longdouble_op>},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_matmul",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+    // my guess we don't need any promoter here as of now, since matmul is quad specific
+    return 0;
+}
+
+
+int
+init_matmul_ops(PyObject *numpy)
+{
+    if (create_matmul_ufunc<quad_add>(numpy, "matmul") < 0) {
+        return -1;
+    }
+    return 0;
+}
+
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
new file mode 100644
index 0000000..bc099eb
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -0,0 +1,8 @@
+#ifndef _QUADDTYPE_MATMUL_OPS_H
+#define _QUADDTYPE_MATMUL_OPS_H
+
+#include <Python.h>
+
+int
+init_matmul_ops(PyObject *numpy);
+#endif
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/umath.cpp b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
index 2ea864e..50f9562 100644
--- a/quaddtype/numpy_quaddtype/src/umath/umath.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/umath.cpp
@@ -22,6 +22,7 @@ extern "C" {
 #include "unary_ops.h"
 #include "binary_ops.h"
 #include "comparison_ops.h"
+#include "matmul.h"
 
 // helper debugging function
 static const char *
@@ -101,10 +102,10 @@ init_quad_umath(void)
         goto err;
     }
 
-    // if (init_quad_matmul(numpy) < 0) {
-    //     PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
-    //     goto err;
-    // }
+    if (init_matmul_ops(numpy) < 0) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad matrix multiplication operations");
+        goto err;
+    }
 
     Py_DECREF(numpy);
     return 0;

From 09918a310859e7879b3d2e2120e87a86e0ca0605 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 15:50:04 +0530
Subject: [PATCH 33/49] mid-way test

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../src/quadblas_interface.cpp                | 180 +++++++++---------
 .../numpy_quaddtype/src/umath/matmul.cpp      |  25 ++-
 quaddtype/release_tracker.md                  |  93 +++++++++
 4 files changed, 195 insertions(+), 105 deletions(-)
 create mode 100644 quaddtype/release_tracker.md

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 0eabb67..4853ac1 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
+Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index cce39c3..46f953d 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,95 +786,95 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
-void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
-{
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    npy_intp lda, ldb, ldc;
-
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = k;
-    }
-    else {
-        lda = m;
-    }
-
-    if (layout_b == QuadBLAS::Layout::RowMajor) {
-        ldb = n;
-    }
-    else {
-        ldb = k;
-    }
-
-    QuadBLAS::Layout result_layout = layout_a;
-    if (result_layout == QuadBLAS::Layout::RowMajor) {
-        ldc = n;
-    }
-    else {
-        ldc = m;
-    }
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-                   ldc);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-        }
-    }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
-}
+// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
+// {
+//     Sleef_quad *data_a, *data_b;
+//     QuadBackendType backend_a, backend_b;
+//     QuadBLAS::Layout layout_a, layout_b;
+
+//     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
+//         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
+//         return nullptr;
+//     }
+
+//     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
+//     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
+//     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
+
+//     if (!sleef_a || !sleef_b) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         return nullptr;
+//     }
+
+//     QuadBackendType result_backend = BACKEND_SLEEF;
+//     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
+//         result_backend = BACKEND_LONGDOUBLE;
+//     }
+
+//     npy_intp result_dims[2] = {m, n};
+//     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
+//     if (!result_dtype) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         return nullptr;
+//     }
+
+//     PyArrayObject *result =
+//             (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
+//     if (!result) {
+//         QuadBLAS::aligned_free(temp_a);
+//         QuadBLAS::aligned_free(temp_b);
+//         Py_DECREF(result_dtype);
+//         return nullptr;
+//     }
+
+//     Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
+//     for (npy_intp i = 0; i < m * n; i++) {
+//         result_data[i] = Sleef_cast_from_doubleq1(0.0);
+//     }
+
+//     npy_intp lda, ldb, ldc;
+
+//     if (layout_a == QuadBLAS::Layout::RowMajor) {
+//         lda = k;
+//     }
+//     else {
+//         lda = m;
+//     }
+
+//     if (layout_b == QuadBLAS::Layout::RowMajor) {
+//         ldb = n;
+//     }
+//     else {
+//         ldb = k;
+//     }
+
+//     QuadBLAS::Layout result_layout = layout_a;
+//     if (result_layout == QuadBLAS::Layout::RowMajor) {
+//         ldc = n;
+//     }
+//     else {
+//         ldc = m;
+//     }
+
+//     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+//     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+//     QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
+//                    ldc);
+
+//     if (result_backend == BACKEND_LONGDOUBLE) {
+//         long double *ld_result = (long double *)PyArray_DATA(result);
+//         for (npy_intp i = 0; i < m * n; i++) {
+//             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+//         }
+//     }
+
+//     QuadBLAS::aligned_free(temp_a);
+//     QuadBLAS::aligned_free(temp_b);
+
+//     return (PyObject *)result;
+// }
 
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 47fdf5f..01b8935 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -5,7 +5,6 @@
 #define NO_IMPORT_ARRAY
 #define NO_IMPORT_UFUNC
 
-
 #include <Python.h>
 #include <cstdio>
 
@@ -25,20 +24,19 @@
 
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
-                                   PyArray_Descr *const given_descrs[],
-                                   PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset))
+                                PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
+                                npy_intp *NPY_UNUSED(view_offset))
 {
-
-  NPY_CASTING casting = NPY_NO_CASTING;    
-  std::cout << "exiting the descriptor";
-  return casting;
+    NPY_CASTING casting = NPY_NO_CASTING;
+    std::cout << "exiting the descriptor";
+    return casting;
 }
 
 template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                          npy_intp const dimensions[], npy_intp const strides[],
-                                          NpyAuxData *auxdata)
+                                           npy_intp const dimensions[], npy_intp const strides[],
+                                           NpyAuxData *auxdata)
 {
     npy_intp N = dimensions[0];
     char *in1_ptr = data[0], *in2_ptr = data[1];
@@ -73,8 +71,8 @@ quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char
 template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                        npy_intp const dimensions[], npy_intp const strides[],
-                                        NpyAuxData *auxdata)
+                                         npy_intp const dimensions[], npy_intp const strides[],
+                                         NpyAuxData *auxdata)
 {
     npy_intp N = dimensions[0];
     char *in1_ptr = data[0], *in2_ptr = data[1];
@@ -101,6 +99,7 @@ quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *c
     return 0;
 }
 
+template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
 int
 create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
 {
@@ -136,13 +135,11 @@ create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
     return 0;
 }
 
-
 int
 init_matmul_ops(PyObject *numpy)
 {
-    if (create_matmul_ufunc<quad_add>(numpy, "matmul") < 0) {
+    if (create_matmul_ufunc<quad_add, ld_add>(numpy, "matmul") < 0) {
         return -1;
     }
     return 0;
 }
-
diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
new file mode 100644
index 0000000..1ecf7d3
--- /dev/null
+++ b/quaddtype/release_tracker.md
@@ -0,0 +1,93 @@
+# Plan for `numpy-quaddtype` v1.5
+
+| ufunc name    | Added |
+| ------------- | ----- |
+| add           | ✅    |
+| subtract      | ✅    |
+| multiply      | ✅    |
+| matmul        | #116  |
+| divide        | ✅    |
+| logaddexp     |       |
+| logaddexp2    |       |
+| true_divide   |       |
+| floor_divide  |       |
+| negative      | ✅    |
+| positive      | ✅    |
+| power         | ✅    |
+| float_power   |       |
+| remainder     |       |
+| mod           | ✅    |
+| fmod          |       |
+| divmod        |       |
+| absolute      | ✅    |
+| fabs          |       |
+| rint          | ✅    |
+| sign          |       |
+| heaviside     |       |
+| conj          |       |
+| conjugate     |       |
+| exp           | ✅    |
+| exp2          | ✅    |
+| log           | ✅    |
+| log2          | ✅    |
+| log10         | ✅    |
+| expm1         |       |
+| log1p         | ✅    |
+| sqrt          | ✅    |
+| square        | ✅    |
+| cbrt          |       |
+| reciprocal    |       |
+| gcd           |       |
+| lcm           |       |
+| sin           | ✅    |
+| cos           | ✅    |
+| tan           | ✅    |
+| arcsin        | ✅    |
+| arccos        | ✅    |
+| arctan        | ✅    |
+| arctan2       | ✅    |
+| hypot         |       |
+| sinh          |       |
+| cosh          |       |
+| tanh          |       |
+| arcsinh       |       |
+| arccosh       |       |
+| arctanh       |       |
+| degrees       |       |
+| radians       |       |
+| deg2rad       |       |
+| rad2deg       |       |
+| bitwise_and   |       |
+| bitwise_or    |       |
+| bitwise_xor   |       |
+| invert        |       |
+| left_shift    |       |
+| right_shift   |       |
+| greater       | ✅    |
+| greater_equal | ✅    |
+| less          | ✅    |
+| less_equal    | ✅    |
+| not_equal     | ✅    |
+| equal         | ✅    |
+| logical_and   |       |
+| logical_or    |       |
+| logical_xor   |       |
+| logical_not   |       |
+| maximum       | ✅    |
+| minimum       | ✅    |
+| fmax          |       |
+| fmin          |       |
+| isfinite      |       |
+| isinf         |       |
+| isnan         |       |
+| isnat         |       |
+| signbit       |       |
+| copysign      |       |
+| nextafter     |       |
+| spacing       |       |
+| modf          |       |
+| ldexp         |       |
+| frexp         |       |
+| floor         | ✅    |
+| ceil          | ✅    |
+| trunc         | ✅    |

From 70ca6446bf8146759a0ca21710f0a908ff3e1eb9 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Thu, 17 Jul 2025 19:47:32 +0530
Subject: [PATCH 34/49] shifting to matmul ufunc

---
 quaddtype/numpy_quaddtype/__init__.py         |  3 +-
 .../numpy_quaddtype/src/quaddtype_main.c      | 69 ++++++++++---------
 quaddtype/tests/test_dot.py                   | 67 ++++++++----------
 3 files changed, 66 insertions(+), 73 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index b0a9f3b..8b588c1 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -3,7 +3,6 @@
     QuadPrecDType,
     is_longdouble_128,
     get_sleef_constant,
-    qblas_dot as dot,
     set_num_threads,
     get_num_threads,
     get_quadblas_version
@@ -17,7 +16,7 @@
     # Constants
     'pi', 'e', 'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon', 
     # QuadBLAS related functions
-    'dot', 'set_num_threads', 'get_num_threads', 'get_quadblas_version'
+    'set_num_threads', 'get_num_threads', 'get_quadblas_version'
 ]
 
 def SleefQuadPrecision(value):
diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
index 641200d..1e8fd53 100644
--- a/quaddtype/numpy_quaddtype/src/quaddtype_main.c
+++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c
@@ -19,45 +19,55 @@
 #include "quadblas_interface.h"
 #include "float.h"
 
-
-static PyObject* py_is_longdouble_128(PyObject* self, PyObject* args) {
-    if(sizeof(long double) == 16 && 
-        LDBL_MANT_DIG == 113 && 
-        LDBL_MAX_EXP == 16384) {
+static PyObject *
+py_is_longdouble_128(PyObject *self, PyObject *args)
+{
+    if (sizeof(long double) == 16 && LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384) {
         Py_RETURN_TRUE;
-    } else {
+    }
+    else {
         Py_RETURN_FALSE;
     }
 }
 
-static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
-    const char* constant_name;
+static PyObject *
+get_sleef_constant(PyObject *self, PyObject *args)
+{
+    const char *constant_name;
     if (!PyArg_ParseTuple(args, "s", &constant_name)) {
         return NULL;
     }
 
-    QuadPrecisionObject* result = QuadPrecision_raw_new(BACKEND_SLEEF);
+    QuadPrecisionObject *result = QuadPrecision_raw_new(BACKEND_SLEEF);
     if (result == NULL) {
         return NULL;
     }
 
     if (strcmp(constant_name, "pi") == 0) {
         result->value.sleef_value = SLEEF_M_PIq;
-    } else if (strcmp(constant_name, "e") == 0) {
+    }
+    else if (strcmp(constant_name, "e") == 0) {
         result->value.sleef_value = SLEEF_M_Eq;
-    } else if (strcmp(constant_name, "log2e") == 0) {
+    }
+    else if (strcmp(constant_name, "log2e") == 0) {
         result->value.sleef_value = SLEEF_M_LOG2Eq;
-    } else if (strcmp(constant_name, "log10e") == 0) {
+    }
+    else if (strcmp(constant_name, "log10e") == 0) {
         result->value.sleef_value = SLEEF_M_LOG10Eq;
-    } else if (strcmp(constant_name, "ln2") == 0) {
+    }
+    else if (strcmp(constant_name, "ln2") == 0) {
         result->value.sleef_value = SLEEF_M_LN2q;
-    } else if (strcmp(constant_name, "ln10") == 0) {
+    }
+    else if (strcmp(constant_name, "ln10") == 0) {
         result->value.sleef_value = SLEEF_M_LN10q;
-    } else if (strcmp(constant_name, "quad_max") == 0) {
+    }
+    else if (strcmp(constant_name, "quad_max") == 0) {
         result->value.sleef_value = SLEEF_QUAD_MAX;
-    } else if (strcmp(constant_name, "quad_min") == 0) {
+    }
+    else if (strcmp(constant_name, "quad_min") == 0) {
         result->value.sleef_value = SLEEF_QUAD_MIN;
-    } else if (strcmp(constant_name, "epsilon") == 0) {
+    }
+    else if (strcmp(constant_name, "epsilon") == 0) {
         result->value.sleef_value = SLEEF_QUAD_EPSILON;
     }
     else {
@@ -66,26 +76,23 @@ static PyObject* get_sleef_constant(PyObject* self, PyObject* args) {
         return NULL;
     }
 
-    return (PyObject*)result;
+    return (PyObject *)result;
 }
 
 static PyMethodDef module_methods[] = {
-    {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
-    {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
-    {"qblas_dot", py_quadblas_dot, METH_VARARGS, "Optimized dot product using QuadBLAS"},
-    {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS, "Set number of threads for QuadBLAS"},
-    {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS, "Get number of threads for QuadBLAS"},
-    {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},
-    {NULL, NULL, 0, NULL} 
-};
+        {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"},
+        {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"},
+        {"set_num_threads", py_quadblas_set_num_threads, METH_VARARGS,
+         "Set number of threads for QuadBLAS"},
+        {"get_num_threads", py_quadblas_get_num_threads, METH_NOARGS,
+         "Get number of threads for QuadBLAS"},
+        {"get_quadblas_version", py_quadblas_get_version, METH_NOARGS, "Get QuadBLAS version"},
+        {NULL, NULL, 0, NULL}};
 
 static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        .m_name = "_quaddtype_main",
+        PyModuleDef_HEAD_INIT, .m_name = "_quaddtype_main",
         .m_doc = "Quad (128-bit) floating point Data Type for NumPy with multiple backends",
-        .m_size = -1,
-        .m_methods = module_methods
-};
+        .m_size = -1, .m_methods = module_methods};
 
 PyMODINIT_FUNC
 PyInit__quaddtype_main(void)
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index ed135f4..f3fa3f6 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -1,19 +1,6 @@
-"""
-Focused test suite for the dot function in numpy_quaddtype
-
-This module tests the QuadBLAS dot function for:
-- Vector-vector dot products
-- Matrix-vector multiplication  
-- Matrix-matrix multiplication
-- Small and large matrix operations
-- Basic correctness validation
-
-Uses only the Sleef backend for simplicity.
-"""
-
 import pytest
 import numpy as np
-from numpy_quaddtype import QuadPrecision, QuadPrecDType, dot
+from numpy_quaddtype import QuadPrecision, QuadPrecDType
 
 
 # ================================================================================
@@ -81,14 +68,14 @@ def create_quad_array(values, shape=None):
 # ================================================================================
 
 class TestVectorVectorDot:
-    """Test vector-vector dot products"""
+    """Test vector-vector np.matmul products"""
     
     def test_simple_dot_product(self):
-        """Test basic vector dot product"""
+        """Test basic vector np.matmul product"""
         x = create_quad_array([1, 2, 3])
         y = create_quad_array([4, 5, 6])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = 1*4 + 2*5 + 3*6  # = 32
         
         assert isinstance(result, QuadPrecision)
@@ -99,14 +86,14 @@ def test_orthogonal_vectors(self):
         x = create_quad_array([1, 0, 0])
         y = create_quad_array([0, 1, 0])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         assert_quad_equal(result, 0.0)
     
     def test_same_vector(self):
-        """Test dot product of vector with itself"""
+        """Test np.matmul product of vector with itself"""
         x = create_quad_array([2, 3, 4])
         
-        result = dot(x, x)
+        result = np.matmul(x, x)
         expected = 2*2 + 3*3 + 4*4  # = 29
         
         assert_quad_equal(result, expected)
@@ -121,7 +108,7 @@ def test_various_vector_sizes(self, size):
         x = create_quad_array(x_vals)
         y = create_quad_array(y_vals)
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = sum(x_vals[i] * y_vals[i] for i in range(size))
         
         assert_quad_equal(result, expected)
@@ -131,7 +118,7 @@ def test_negative_and_fractional_values(self):
         x = create_quad_array([1.5, -2.5, 3.25])
         y = create_quad_array([-1.25, 2.75, -3.5])
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = 1.5*(-1.25) + (-2.5)*2.75 + 3.25*(-3.5)
         
         assert_quad_equal(result, expected)
@@ -151,7 +138,7 @@ def test_simple_matrix_vector(self):
         # 3x1 vector  
         x = create_quad_array([1, 1, 1])
         
-        result = dot(A, x)
+        result = np.matmul(A, x)
         expected = [1+2+3, 4+5+6]  # [6, 15]
         
         assert result.shape == (2,)
@@ -164,7 +151,7 @@ def test_identity_matrix_vector(self):
         I = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3))
         x = create_quad_array([2, 3, 4])
         
-        result = dot(I, x)
+        result = np.matmul(I, x)
         
         assert result.shape == (3,)
         for i in range(3):
@@ -181,7 +168,7 @@ def test_various_matrix_vector_sizes(self, m, n):
         x_vals = [i + 1 for i in range(n)]
         x = create_quad_array(x_vals)
         
-        result = dot(A, x)
+        result = np.matmul(A, x)
         
         assert result.shape == (m,)
         
@@ -205,7 +192,7 @@ def test_simple_matrix_matrix(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         B = create_quad_array([5, 6, 7, 8], shape=(2, 2))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         # Expected: [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]] = [[19, 22], [43, 50]]
         expected = [[19, 22], [43, 50]]
@@ -221,11 +208,11 @@ def test_identity_matrix_multiplication(self):
         I = create_quad_array([1, 0, 0, 1], shape=(2, 2))
         
         # A * I should equal A
-        result1 = dot(A, I)
+        result1 = np.matmul(A, I)
         assert_quad_array_equal(result1, A)
         
         # I * A should equal A  
-        result2 = dot(I, A)
+        result2 = np.matmul(I, A)
         assert_quad_array_equal(result2, A)
     
     @pytest.mark.parametrize("m,n,k", [(2,2,2), (2,3,4), (3,2,5), (4,4,4), (5,6,7)])
@@ -239,7 +226,7 @@ def test_various_matrix_sizes(self, m, n, k):
         B_vals = [(i*n + j + 1) for i in range(k) for j in range(n)]
         B = create_quad_array(B_vals, shape=(k, n))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (m, n)
         
@@ -258,12 +245,12 @@ def test_associativity(self):
         C = create_quad_array([1, 1, 2, 1], shape=(2, 2))
         
         # Compute (A*B)*C
-        AB = dot(A, B)
-        result1 = dot(AB, C)
+        AB = np.matmul(A, B)
+        result1 = np.matmul(AB, C)
         
         # Compute A*(B*C)
-        BC = dot(B, C)
-        result2 = dot(A, BC)
+        BC = np.matmul(B, C)
+        result2 = np.matmul(A, BC)
         
         assert_quad_array_equal(result1, result2, rtol=1e-25)
 
@@ -285,7 +272,7 @@ def test_large_square_matrices(self, size):
         A = create_quad_array(A_vals, shape=(size, size))
         B = create_quad_array(B_vals, shape=(size, size))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (size, size)
         
@@ -303,7 +290,7 @@ def test_large_square_matrices(self, size):
             assert_quad_equal(result[size//2, size//2], expected_value, rtol=1e-15, atol=1e-15)
     
     def test_large_vector_operations(self):
-        """Test large vector dot products"""
+        """Test large vector np.matmul products"""
         size = 1000
         
         # Create vectors with known sum
@@ -313,7 +300,7 @@ def test_large_vector_operations(self):
         x = create_quad_array(x_vals)
         y = create_quad_array(y_vals)
         
-        result = dot(x, y)
+        result = np.matmul(x, y)
         expected = size * 1.0 * 2.0  # = 2000.0
         
         assert_quad_equal(result, expected)
@@ -329,7 +316,7 @@ def test_rectangular_large_matrices(self):
         A = create_quad_array(A_vals, shape=(m, k))
         B = create_quad_array(B_vals, shape=(k, n))
         
-        result = dot(A, B)
+        result = np.matmul(A, B)
         
         assert result.shape == (m, n)
         
@@ -354,7 +341,7 @@ def test_dimension_mismatch_vectors(self):
         y = create_quad_array([1, 2, 3])
         
         with pytest.raises(ValueError, match="same length"):
-            dot(x, y)
+            np.matmul(x, y)
     
     def test_dimension_mismatch_matrix_vector(self):
         """Test dimension mismatch in matrix-vector"""
@@ -362,7 +349,7 @@ def test_dimension_mismatch_matrix_vector(self):
         x = create_quad_array([1, 2, 3])  # Wrong size
         
         with pytest.raises(ValueError, match="columns must match"):
-            dot(A, x)
+            np.matmul(A, x)
     
     def test_dimension_mismatch_matrices(self):
         """Test dimension mismatch in matrix-matrix"""
@@ -370,7 +357,7 @@ def test_dimension_mismatch_matrices(self):
         B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
         
         with pytest.raises(ValueError, match="Matrix inner dimensions must match"):
-            dot(A, B)
+            np.matmul(A, B)
 
 
 if __name__ == "__main__":

From f89c2e6a1bc66433a3c3dd8c3cd0e9a101b400e2 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Fri, 18 Jul 2025 01:22:58 +0530
Subject: [PATCH 35/49] will figure out later: trigger CI on `dot` branch, drop commented-out matmul_op

---
 .github/workflows/build_wheels.yml            |  2 +-
 .github/workflows/ci.yml                      |  6 +-
 .../src/quadblas_interface.cpp                | 91 -------------------
 3 files changed, 4 insertions(+), 95 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e16e2ff..0bf55c4 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - matmul-ufunc
+      - dot
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a70e8c1..e42c12c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - matmul-ufunc
+      - dot
   pull_request:
   workflow_dispatch:
 
@@ -76,13 +76,13 @@ jobs:
           # Initialize submodules first
           git submodule update --init --recursive
           ls -la numpy_quaddtype/QBLAS/
-          
+
           # Set environment variables with proper export and correct paths
           export CFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include"
           export CXXFLAGS="-I/usr/local/include -I$(pwd)/numpy_quaddtype/QBLAS/include -fext-numeric-literals"
           export LDFLAGS="-L/usr/local/lib64 -L/usr/local/lib -Wl,-rpath,/usr/local/lib64 -Wl,-rpath,/usr/local/lib -fopenmp"
           export LD_LIBRARY_PATH="/usr/local/lib64:/usr/local/lib:$LD_LIBRARY_PATH"
-          
+
           # Install with meson args to ensure the C++ flags are passed through
           python -m pip install . -v --no-build-isolation \
             -Cbuilddir=build \
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 46f953d..6bc3fb0 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -786,95 +786,4 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString(QuadBLAS::VERSION);
 }
 
-// void matmul_op(Sleef_quad * inp1, Sleef_quad *inp2, Sleef_quad *out)
-// {
-//     Sleef_quad *data_a, *data_b;
-//     QuadBackendType backend_a, backend_b;
-//     QuadBLAS::Layout layout_a, layout_b;
-
-//     if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-//         !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-//         return nullptr;
-//     }
-
-//     Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-//     Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-//     Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-//     if (!sleef_a || !sleef_b) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         return nullptr;
-//     }
-
-//     QuadBackendType result_backend = BACKEND_SLEEF;
-//     if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-//         result_backend = BACKEND_LONGDOUBLE;
-//     }
-
-//     npy_intp result_dims[2] = {m, n};
-//     QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-//     if (!result_dtype) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         return nullptr;
-//     }
-
-//     PyArrayObject *result =
-//             (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-//     if (!result) {
-//         QuadBLAS::aligned_free(temp_a);
-//         QuadBLAS::aligned_free(temp_b);
-//         Py_DECREF(result_dtype);
-//         return nullptr;
-//     }
-
-//     Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-//     for (npy_intp i = 0; i < m * n; i++) {
-//         result_data[i] = Sleef_cast_from_doubleq1(0.0);
-//     }
-
-//     npy_intp lda, ldb, ldc;
-
-//     if (layout_a == QuadBLAS::Layout::RowMajor) {
-//         lda = k;
-//     }
-//     else {
-//         lda = m;
-//     }
-
-//     if (layout_b == QuadBLAS::Layout::RowMajor) {
-//         ldb = n;
-//     }
-//     else {
-//         ldb = k;
-//     }
-
-//     QuadBLAS::Layout result_layout = layout_a;
-//     if (result_layout == QuadBLAS::Layout::RowMajor) {
-//         ldc = n;
-//     }
-//     else {
-//         ldc = m;
-//     }
-
-//     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-//     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-//     QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-//                    ldc);
-
-//     if (result_backend == BACKEND_LONGDOUBLE) {
-//         long double *ld_result = (long double *)PyArray_DATA(result);
-//         for (npy_intp i = 0; i < m * n; i++) {
-//             ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-//         }
-//     }
-
-//     QuadBLAS::aligned_free(temp_a);
-//     QuadBLAS::aligned_free(temp_b);
-
-//     return (PyObject *)result;
-// }
-
 #endif  // DISABLE_QUADBLAS
\ No newline at end of file

From 894a84db72cf4f9ec3b0367ae669102cb61eecda Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 13:46:32 +0530
Subject: [PATCH 36/49] matmul registered with naive

---
 .../numpy_quaddtype/src/umath/matmul.cpp      | 264 +++++++++++++-----
 quaddtype/numpy_quaddtype/src/umath/matmul.h  |  38 ++-
 quaddtype/tests/test_dot.py                   |   6 +-
 3 files changed, 225 insertions(+), 83 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 01b8935..00cc858 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -5,141 +5,251 @@
 #define NO_IMPORT_ARRAY
 #define NO_IMPORT_UFUNC
 
+extern "C" {
 #include <Python.h>
 #include <cstdio>
+#include <string.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
 #include "numpy/ufuncobject.h"
 #include "numpy/dtype_api.h"
-#include "numpy/ndarraytypes.h"
+}
 
 #include "../quad_common.h"
 #include "../scalar.h"
 #include "../dtype.h"
 #include "../ops.hpp"
-#include "binary_ops.h"
 #include "matmul.h"
+#include "promoters.hpp"
 
-#include <iostream>
-
+/**
+ * Resolve descriptors for matmul operation.
+ * Follows the same pattern as binary_ops.cpp
+ */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
                                 npy_intp *NPY_UNUSED(view_offset))
 {
-    NPY_CASTING casting = NPY_NO_CASTING;
-    std::cout << "exiting the descriptor";
-    return casting;
-}
+    // Follow the exact same pattern as quad_binary_op_resolve_descriptors
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                           npy_intp const dimensions[], npy_intp const strides[],
-                                           NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+    // Determine target backend and if casting is needed
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
 
-    quad_value in1, in2, out;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
         }
         else {
-            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
         }
-        memcpy(out_ptr, &out, elem_size);
+    }
 
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
+    // Set up output descriptor
+    if (given_descrs[2] == NULL) {
+        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+        if (!loop_descrs[2]) {
+            return (NPY_CASTING)-1;
+        }
     }
-    return 0;
+    else {
+        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
+        if (descr_out->backend != target_backend) {
+            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[2]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[2]);
+            loop_descrs[2] = given_descrs[2];
+        }
+    }
+    return casting;
 }
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                         npy_intp const dimensions[], npy_intp const strides[],
-                                         NpyAuxData *auxdata)
+/**
+ * Matrix multiplication strided loop using NumPy 2.0 API.
+ * Implements general matrix multiplication for arbitrary dimensions.
+ *
+ * For matmul with signature (m?,n),(n,p?)->(m?,p?):
+ * - dimensions[0] = N (loop dimension, number of batch operations)
+ * - dimensions[1] = m (rows of first matrix)
+ * - dimensions[2] = n (cols of first matrix / rows of second matrix)
+ * - dimensions[3] = p (cols of second matrix)
+ *
+ * - strides[0], strides[1], strides[2] = batch strides for A, B, C
+ * - strides[3], strides[4] = row stride, col stride for A (m, n)
+ * - strides[5], strides[6] = row stride, col stride for B (n, p)
+ * - strides[7], strides[8] = row stride, col stride for C (m, p)
+ */
+static int
+quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                         npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
 {
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
+    // Extract dimensions
+    npy_intp N = dimensions[0];  // Number of batch operations
+    npy_intp m = dimensions[1];  // Rows of first matrix
+    npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
+    npy_intp p = dimensions[3];  // Cols of second matrix
+
+    // Extract batch strides
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    // Extract core strides for matrix dimensions
+    npy_intp A_row_stride = strides[3];  // Stride along m dimension of A
+    npy_intp A_col_stride = strides[4];  // Stride along n dimension of A
+    npy_intp B_row_stride = strides[5];  // Stride along n dimension of B
+    npy_intp B_col_stride = strides[6];  // Stride along p dimension of B
+    npy_intp C_row_stride = strides[7];  // Stride along m dimension of C
+    npy_intp C_col_stride = strides[8];  // Stride along p dimension of C
+
+    // Get backend from descriptor
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
 
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+    // Process each batch
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        // Perform matrix multiplication: C = A @ B
+        // C[i,j] = sum_k(A[i,k] * B[k,j])
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);  // Initialize to 0
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+
+                        // sum += A[i,k] * B[k,j]
+                        sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val));
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    // Long double backend
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
         }
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
     }
+
     return 0;
 }
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+/**
+ * Register matmul support following the exact same pattern as binary_ops.cpp
+ */
 int
-create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
+init_matmul_ops(PyObject *numpy)
 {
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n");
+
+    // Get the existing matmul ufunc - same pattern as binary_ops
+    PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
+        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
+    // Use the same pattern as binary_ops.cpp
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-            {NPY_METH_strided_loop,
-             (void *)&quad_generic_matmul_strided_loop_aligned<sleef_op, longdouble_op>},
-            {NPY_METH_unaligned_strided_loop,
-             (void *)&quad_generic_matmul_strided_loop_unaligned<sleef_op, longdouble_op>},
-            {0, NULL}};
+    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul",
             .nin = 2,
             .nout = 1,
             .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
             .dtypes = dtypes,
             .slots = slots,
     };
 
+    printf("DEBUG: About to add loop to matmul ufunc...\n");
+
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        printf("DEBUG: Failed to add loop to matmul ufunc\n");
+        Py_DECREF(ufunc);
         return -1;
     }
-    // my guess we don't need any promoter here as of now, since matmul is quad specific
-    return 0;
-}
 
-int
-init_matmul_ops(PyObject *numpy)
-{
-    if (create_matmul_ufunc<quad_add, ld_add>(numpy, "matmul") < 0) {
+    printf("DEBUG: Successfully added matmul loop!\n");
+
+    // Add promoter following binary_ops pattern
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == NULL) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(ufunc);
         return -1;
     }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
+        PyErr_Clear();  // Don't fail if promoter fails
+    }
+    else {
+        printf("DEBUG: Successfully added promoter\n");
+    }
+
+    Py_DECREF(DTypes);
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(ufunc);
+
+    printf("DEBUG: init_matmul_ops completed successfully\n");
     return 0;
-}
+}
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
index bc099eb..947e2c3 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.h
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -1,8 +1,40 @@
-#ifndef _QUADDTYPE_MATMUL_OPS_H
-#define _QUADDTYPE_MATMUL_OPS_H
+#ifndef _QUADDTYPE_MATMUL_H
+#define _QUADDTYPE_MATMUL_H
+
+/**
+ * Quad Precision Matrix Multiplication for NumPy
+ *
+ * This module implements matrix multiplication functionality for the QuadPrecDType
+ * by registering custom loops with numpy's matmul generalized ufunc.
+ *
+ * Supports all matmul operation types:
+ * - Vector-vector (dot product): (n,) @ (n,) -> scalar
+ * - Matrix-vector: (m,n) @ (n,) -> (m,)
+ * - Vector-matrix: (n,) @ (n,p) -> (p,)
+ * - Matrix-matrix: (m,n) @ (n,p) -> (m,p)
+ *
+ * Uses naive algorithms optimized for correctness rather than performance.
+ * For production use, consider integration with QBLAS optimized routines.
+ */
 
 #include <Python.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the matmul operations for the quad precision dtype.
+ * This function adds the quad precision loop to numpy's existing matmul gufunc.
+ *
+ * @param numpy The numpy module object
+ * @return 0 on success, -1 on failure
+ */
 int
 init_matmul_ops(PyObject *numpy);
-#endif
\ No newline at end of file
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // _QUADDTYPE_MATMUL_H
\ No newline at end of file
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index f3fa3f6..9256f3d 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -340,7 +340,7 @@ def test_dimension_mismatch_vectors(self):
         x = create_quad_array([1, 2])
         y = create_quad_array([1, 2, 3])
         
-        with pytest.raises(ValueError, match="same length"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(x, y)
     
     def test_dimension_mismatch_matrix_vector(self):
@@ -348,7 +348,7 @@ def test_dimension_mismatch_matrix_vector(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         x = create_quad_array([1, 2, 3])  # Wrong size
         
-        with pytest.raises(ValueError, match="columns must match"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(A, x)
     
     def test_dimension_mismatch_matrices(self):
@@ -356,7 +356,7 @@ def test_dimension_mismatch_matrices(self):
         A = create_quad_array([1, 2, 3, 4], shape=(2, 2))
         B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
         
-        with pytest.raises(ValueError, match="Matrix inner dimensions must match"):
+        with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
             np.matmul(A, B)
 
 

From 6800a906b1fbcdd363f02482c5b6f225463cc456 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 16:29:09 +0530
Subject: [PATCH 37/49] add initial QBLAS support to matmul ufunc (WIP: something
 is still breaking, NaN)

---
 .../src/quadblas_interface.cpp                | 806 ++----------------
 .../numpy_quaddtype/src/quadblas_interface.h  |  34 +-
 .../numpy_quaddtype/src/umath/matmul.cpp      | 263 +++---
 3 files changed, 257 insertions(+), 846 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6bc3fb0..185e2d8 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -1,773 +1,126 @@
-#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
-#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
-#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
-#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
-#define NO_IMPORT_ARRAY
-#define NO_IMPORT_UFUNC
-
-extern "C" {
-#include <Python.h>
-#include "numpy/arrayobject.h"
-#include "numpy/ndarraytypes.h"
-#include "numpy/dtype_api.h"
-}
-
-#include "scalar.h"
-#include "dtype.h"
-#include "quad_common.h"
 #include "quadblas_interface.h"
-
-extern "C" {
-#include <sleef.h>
-#include <sleefquad.h>
-}
-
-#ifndef DISABLE_QUADBLAS
 #include "../QBLAS/include/quadblas/quadblas.hpp"
-#endif
-
-#ifdef DISABLE_QUADBLAS
-
-static bool
-extract_quad_array_info_simple(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend)
-{
-    if (!PyArray_Check(arr)) {
-        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
-        return false;
-    }
-
-    PyArray_Descr *descr = PyArray_DESCR(arr);
-    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
-        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
-        return false;
-    }
-
-    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
-    *backend = quad_descr->backend;
-    *data = (Sleef_quad *)PyArray_DATA(arr);
-
-    return true;
-}
-
-static Sleef_quad *
-ensure_sleef_backend_simple(PyArrayObject *arr, QuadBackendType original_backend,
-                            Sleef_quad **temp_storage)
-{
-    if (original_backend == BACKEND_SLEEF) {
-        *temp_storage = nullptr;
-        return (Sleef_quad *)PyArray_DATA(arr);
-    }
-
-    npy_intp size = PyArray_SIZE(arr);
-    *temp_storage = (Sleef_quad *)malloc(size * sizeof(Sleef_quad));
-    if (!*temp_storage) {
-        PyErr_NoMemory();
-        return nullptr;
-    }
-
-    long double *ld_data = (long double *)PyArray_DATA(arr);
-    for (npy_intp i = 0; i < size; i++) {
-        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);
-    }
+#include <cstring>
+#include <algorithm>
 
-    return *temp_storage;
-}
-
-// ===============================================================================
-// FALLBACK IMPLEMENTATIONS (No QuadBLAS)
-// ===============================================================================
+extern "C" {
 
-static PyObject *
-dot_vector_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
 {
-    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp n_a = PyArray_DIM(a, 0);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n_a != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    // Simple dot product implementation
-    Sleef_quad result = Sleef_cast_from_doubleq1(0.0);
-    for (npy_intp i = 0; i < n_a; i++) {
-        result = Sleef_fmaq1_u05(sleef_a[i], sleef_b[i], result);
-    }
-
-    free(temp_a);
-    free(temp_b);
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
-    if (!result_obj) {
-        return nullptr;
+    if (!x || !y || !result || n == 0) {
+        return -1;
     }
 
-    if (result_backend == BACKEND_SLEEF) {
-        result_obj->value.sleef_value = result;
+    try {
+        *result = QuadBLAS::dot(n, x, incx, y, incy);
+        return 0;
     }
-    else {
-        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
+    catch (...) {
+        return -1;
     }
-
-    return (PyObject *)result_obj;
 }
 
-static PyObject *
-dot_matrix_vector_fallback(PyArrayObject *a, PyArrayObject *b)
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
 {
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp n = PyArray_DIM(a, 1);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[1] = {m};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        free(temp_a);
-        free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    // Initialize result to zero
-    for (npy_intp i = 0; i < m; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
+    if (!alpha || !A || !x || !beta || !y || m == 0 || n == 0) {
+        return -1;
     }
 
-    // Simple matrix-vector multiplication: result[i] = sum(A[i,j] * b[j])
-    for (npy_intp i = 0; i < m; i++) {
-        Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
-        for (npy_intp j = 0; j < n; j++) {
-            // Assume row-major layout: A[i,j] = sleef_a[i*n + j]
-            sum = Sleef_fmaq1_u05(sleef_a[i * n + j], sleef_b[j], sum);
+    try {
+        // Convert layout
+        QuadBLAS::Layout qblas_layout;
+        if (layout == 'R' || layout == 'r') {
+            qblas_layout = QuadBLAS::Layout::RowMajor;
         }
-        result_data[i] = sum;
-    }
-
-    // Convert to longdouble if needed
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        else if (layout == 'C' || layout == 'c') {
+            qblas_layout = QuadBLAS::Layout::ColMajor;
         }
-    }
-
-    free(temp_a);
-    free(temp_b);
-
-    return (PyObject *)result;
-}
-
-static PyObject *
-dot_matrix_matrix_fallback(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp k = PyArray_DIM(a, 1);
-    npy_intp k_b = PyArray_DIM(b, 0);
-    npy_intp n = PyArray_DIM(b, 1);
-
-    if (k != k_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-
-    if (!extract_quad_array_info_simple(a, &data_a, &backend_a) ||
-        !extract_quad_array_info_simple(b, &data_b, &backend_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend_simple(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend_simple(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        free(temp_a);
-        free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        free(temp_a);
-        free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    // Initialize result matrix to zero
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    // Simple matrix-matrix multiplication: C[i,j] = sum(A[i,l] * B[l,j])
-    for (npy_intp i = 0; i < m; i++) {
-        for (npy_intp j = 0; j < n; j++) {
-            Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
-            for (npy_intp l = 0; l < k; l++) {
-                // Row-major: A[i,l] = sleef_a[i*k + l], B[l,j] = sleef_b[l*n + j]
-                sum = Sleef_fmaq1_u05(sleef_a[i * k + l], sleef_b[l * n + j], sum);
-            }
-            result_data[i * n + j] = sum;
+        else {
+            return -1;  // Invalid layout
         }
-    }
 
-    // Convert to longdouble if needed
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+        // Handle transpose (swap dimensions for transpose)
+        size_t actual_m = m, actual_n = n;
+        if (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c') {
+            std::swap(actual_m, actual_n);
+            // For transpose, we need to adjust the layout
+            if (qblas_layout == QuadBLAS::Layout::RowMajor) {
+                qblas_layout = QuadBLAS::Layout::ColMajor;
+            }
+            else {
+                qblas_layout = QuadBLAS::Layout::RowMajor;
+            }
         }
-    }
 
-    free(temp_a);
-    free(temp_b);
+        // Call QBLAS GEMV
+        QuadBLAS::gemv(qblas_layout, actual_m, actual_n, *alpha, A, lda, x, incx, *beta, y, incy);
 
-    return (PyObject *)result;
-}
-
-PyObject *
-py_quadblas_dot(PyObject *self, PyObject *args)
-{
-    PyObject *a_obj, *b_obj;
-
-    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
-        return nullptr;
-    }
-
-    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
-    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
-
-    if (!a || !b) {
-        Py_XDECREF(a);
-        Py_XDECREF(b);
-        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
-        return nullptr;
-    }
-
-    PyObject *result = nullptr;
-
-    int ndim_a = PyArray_NDIM(a);
-    int ndim_b = PyArray_NDIM(b);
-
-    if (ndim_a == 1 && ndim_b == 1) {
-        result = dot_vector_vector_fallback(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 1) {
-        result = dot_matrix_vector_fallback(a, b);
+        return 0;
     }
-    else if (ndim_a == 2 && ndim_b == 2) {
-        result = dot_matrix_matrix_fallback(a, b);
+    catch (...) {
+        return -1;
     }
-    else if (ndim_a == 1 && ndim_b == 2) {
-        PyErr_SetString(PyExc_ValueError,
-                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
-    }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
-    }
-
-    Py_DECREF(a);
-    Py_DECREF(b);
-
-    return result;
 }
 
-// Dummy implementations for other QuadBLAS functions
-PyObject *
-py_quadblas_set_num_threads(PyObject *self, PyObject *args)
-{
-    // On Windows fallback, just ignore thread setting
-    Py_RETURN_NONE;
-}
-
-PyObject *
-py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc)
 {
-    // Return 1 for fallback implementation
-    return PyLong_FromLong(1);
-}
-
-PyObject *
-py_quadblas_get_version(PyObject *self, PyObject *args)
-{
-    return PyUnicode_FromString("QuadBLAS is disabled for MSVC");
-}
-
-#else
-
-static QuadBLAS::Layout
-get_quadblas_layout(PyArrayObject *arr)
-{
-    if (PyArray_IS_C_CONTIGUOUS(arr)) {
-        return QuadBLAS::Layout::RowMajor;
-    }
-    else {
-        return QuadBLAS::Layout::ColMajor;
-    }
-}
-
-static bool
-extract_quad_array_info(PyArrayObject *arr, Sleef_quad **data, QuadBackendType *backend,
-                        QuadBLAS::Layout *layout)
-{
-    if (!PyArray_Check(arr)) {
-        PyErr_SetString(PyExc_TypeError, "Expected numpy array");
-        return false;
-    }
-
-    PyArray_Descr *descr = PyArray_DESCR(arr);
-    if (!PyObject_TypeCheck(descr, (PyTypeObject *)&QuadPrecDType)) {
-        PyErr_SetString(PyExc_TypeError, "Array must have QuadPrecDType dtype");
-        return false;
-    }
-
-    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)descr;
-    *backend = quad_descr->backend;
-    *data = (Sleef_quad *)PyArray_DATA(arr);
-    *layout = get_quadblas_layout(arr);
-
-    return true;
-}
-
-static Sleef_quad *
-ensure_sleef_backend(PyArrayObject *arr, QuadBackendType original_backend,
-                     Sleef_quad **temp_storage)
-{
-    if (original_backend == BACKEND_SLEEF) {
-        *temp_storage = nullptr;
-        return (Sleef_quad *)PyArray_DATA(arr);
-    }
-
-    npy_intp size = PyArray_SIZE(arr);
-    *temp_storage = QuadBLAS::aligned_alloc<Sleef_quad>(size);
-    if (!*temp_storage) {
-        PyErr_NoMemory();
-        return nullptr;
-    }
-
-    long double *ld_data = (long double *)PyArray_DATA(arr);
-    for (npy_intp i = 0; i < size; i++) {
-        (*temp_storage)[i] = Sleef_cast_from_doubleq1((double)ld_data[i]);
-    }
-
-    return *temp_storage;
-}
-
-static PyObject *
-dot_vector_vector(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 1 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 1-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp n_a = PyArray_DIM(a, 0);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n_a != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Arrays must have the same length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    npy_intp stride_a = PyArray_STRIDE(a, 0) / PyArray_ITEMSIZE(a);
-    npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
-
-    Sleef_quad result = QuadBLAS::dot(n_a, sleef_a, stride_a, sleef_b, stride_b);
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    QuadPrecisionObject *result_obj = QuadPrecision_raw_new(result_backend);
-    if (!result_obj) {
-        return nullptr;
-    }
-
-    if (result_backend == BACKEND_SLEEF) {
-        result_obj->value.sleef_value = result;
-    }
-    else {
-        result_obj->value.longdouble_value = (long double)Sleef_cast_to_doubleq1(result);
-    }
-
-    return (PyObject *)result_obj;
-}
-
-static PyObject *
-dot_matrix_vector(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 1) {
-        PyErr_SetString(PyExc_ValueError, "First input must be 2D, second input must be 1D");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp n = PyArray_DIM(a, 1);
-    npy_intp n_b = PyArray_DIM(b, 0);
-
-    if (n != n_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix columns must match vector length");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[1] = {m};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(1, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
+    if (!alpha || !A || !B || !beta || !C || m == 0 || n == 0 || k == 0) {
+        return -1;
     }
 
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-
-    npy_intp lda;
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = n;
-    }
-    else {
-        lda = m;
-    }
-
-    npy_intp stride_b = PyArray_STRIDE(b, 0) / PyArray_ITEMSIZE(b);
-    npy_intp stride_result = PyArray_STRIDE(result, 0) / PyArray_ITEMSIZE(result);
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemv(layout_a, m, n, alpha, sleef_a, lda, sleef_b, stride_b, beta, result_data,
-                   stride_result);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
+    try {
+        // Convert layout
+        QuadBLAS::Layout qblas_layout;
+        if (layout == 'R' || layout == 'r') {
+            qblas_layout = QuadBLAS::Layout::RowMajor;
+        }
+        else if (layout == 'C' || layout == 'c') {
+            qblas_layout = QuadBLAS::Layout::ColMajor;
+        }
+        else {
+            return -1;  // Invalid layout
         }
-    }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
-}
-
-static PyObject *
-dot_matrix_matrix(PyArrayObject *a, PyArrayObject *b)
-{
-    if (PyArray_NDIM(a) != 2 || PyArray_NDIM(b) != 2) {
-        PyErr_SetString(PyExc_ValueError, "Both inputs must be 2-dimensional arrays");
-        return nullptr;
-    }
-
-    npy_intp m = PyArray_DIM(a, 0);
-    npy_intp k = PyArray_DIM(a, 1);
-    npy_intp k_b = PyArray_DIM(b, 0);
-    npy_intp n = PyArray_DIM(b, 1);
-
-    if (k != k_b) {
-        PyErr_SetString(PyExc_ValueError, "Matrix inner dimensions must match");
-        return nullptr;
-    }
-
-    Sleef_quad *data_a, *data_b;
-    QuadBackendType backend_a, backend_b;
-    QuadBLAS::Layout layout_a, layout_b;
-
-    if (!extract_quad_array_info(a, &data_a, &backend_a, &layout_a) ||
-        !extract_quad_array_info(b, &data_b, &backend_b, &layout_b)) {
-        return nullptr;
-    }
-
-    Sleef_quad *temp_a = nullptr, *temp_b = nullptr;
-    Sleef_quad *sleef_a = ensure_sleef_backend(a, backend_a, &temp_a);
-    Sleef_quad *sleef_b = ensure_sleef_backend(b, backend_b, &temp_b);
-
-    if (!sleef_a || !sleef_b) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    QuadBackendType result_backend = BACKEND_SLEEF;
-    if (backend_a == BACKEND_LONGDOUBLE && backend_b == BACKEND_LONGDOUBLE) {
-        result_backend = BACKEND_LONGDOUBLE;
-    }
-
-    npy_intp result_dims[2] = {m, n};
-    QuadPrecDTypeObject *result_dtype = new_quaddtype_instance(result_backend);
-    if (!result_dtype) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        return nullptr;
-    }
-
-    PyArrayObject *result =
-            (PyArrayObject *)PyArray_Empty(2, result_dims, (PyArray_Descr *)result_dtype, 0);
-    if (!result) {
-        QuadBLAS::aligned_free(temp_a);
-        QuadBLAS::aligned_free(temp_b);
-        Py_DECREF(result_dtype);
-        return nullptr;
-    }
-
-    Sleef_quad *result_data = (Sleef_quad *)PyArray_DATA(result);
-    for (npy_intp i = 0; i < m * n; i++) {
-        result_data[i] = Sleef_cast_from_doubleq1(0.0);
-    }
-
-    npy_intp lda, ldb, ldc;
 
-    if (layout_a == QuadBLAS::Layout::RowMajor) {
-        lda = k;
-    }
-    else {
-        lda = m;
-    }
+        // For now, we only support no transpose
+        // TODO: Implement transpose support if needed
+        if ((transa != 'N' && transa != 'n') || (transb != 'N' && transb != 'n')) {
+            return -1;  // Transpose not implemented yet
+        }
 
-    if (layout_b == QuadBLAS::Layout::RowMajor) {
-        ldb = n;
-    }
-    else {
-        ldb = k;
-    }
+        // Call QBLAS GEMM
+        QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
 
-    QuadBLAS::Layout result_layout = layout_a;
-    if (result_layout == QuadBLAS::Layout::RowMajor) {
-        ldc = n;
-    }
-    else {
-        ldc = m;
+        return 0;
     }
-
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    QuadBLAS::gemm(result_layout, m, n, k, alpha, sleef_a, lda, sleef_b, ldb, beta, result_data,
-                   ldc);
-
-    if (result_backend == BACKEND_LONGDOUBLE) {
-        long double *ld_result = (long double *)PyArray_DATA(result);
-        for (npy_intp i = 0; i < m * n; i++) {
-            ld_result[i] = (long double)Sleef_cast_to_doubleq1(result_data[i]);
-        }
+    catch (...) {
+        return -1;
     }
-
-    QuadBLAS::aligned_free(temp_a);
-    QuadBLAS::aligned_free(temp_b);
-
-    return (PyObject *)result;
 }
 
-PyObject *
-py_quadblas_dot(PyObject *self, PyObject *args)
+int
+qblas_supports_backend(QuadBackendType backend)
 {
-    PyObject *a_obj, *b_obj;
-
-    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
-        return nullptr;
-    }
-
-    PyArrayObject *a = (PyArrayObject *)PyArray_FROM_OF(a_obj, NPY_ARRAY_ALIGNED);
-    PyArrayObject *b = (PyArrayObject *)PyArray_FROM_OF(b_obj, NPY_ARRAY_ALIGNED);
-
-    if (!a || !b) {
-        Py_XDECREF(a);
-        Py_XDECREF(b);
-        PyErr_SetString(PyExc_TypeError, "Inputs must be convertible to arrays");
-        return nullptr;
-    }
-
-    PyObject *result = nullptr;
-
-    int ndim_a = PyArray_NDIM(a);
-    int ndim_b = PyArray_NDIM(b);
-
-    if (ndim_a == 1 && ndim_b == 1) {
-        result = dot_vector_vector(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 1) {
-        result = dot_matrix_vector(a, b);
-    }
-    else if (ndim_a == 2 && ndim_b == 2) {
-        result = dot_matrix_matrix(a, b);
-    }
-    else if (ndim_a == 1 && ndim_b == 2) {
-        PyErr_SetString(PyExc_ValueError,
-                        "Vector-Matrix multiplication not supported (use Matrix-Vector instead)");
-    }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "Unsupported array dimensions. Supported: (1D,1D), (2D,1D), (2D,2D)");
-    }
-
-    Py_DECREF(a);
-    Py_DECREF(b);
-
-    return result;
+    // QBLAS only supports SLEEF backend
+    return (backend == BACKEND_SLEEF) ? 1 : 0;
 }
 
 PyObject *
 py_quadblas_set_num_threads(PyObject *self, PyObject *args)
 {
     int num_threads;
-
     if (!PyArg_ParseTuple(args, "i", &num_threads)) {
-        return nullptr;
+        return NULL;
     }
 
-    if (num_threads < 1) {
+    if (num_threads <= 0) {
         PyErr_SetString(PyExc_ValueError, "Number of threads must be positive");
-        return nullptr;
+        return NULL;
     }
 
     QuadBLAS::set_num_threads(num_threads);
@@ -777,13 +130,14 @@ py_quadblas_set_num_threads(PyObject *self, PyObject *args)
 PyObject *
 py_quadblas_get_num_threads(PyObject *self, PyObject *args)
 {
-    return PyLong_FromLong(QuadBLAS::get_num_threads());
+    int num_threads = QuadBLAS::get_num_threads();
+    return PyLong_FromLong(num_threads);
 }
 
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args)
 {
-    return PyUnicode_FromString(QuadBLAS::VERSION);
+    return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS");
 }
 
-#endif  // DISABLE_QUADBLAS
\ No newline at end of file
+}  // extern "C"
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index da8f0a8..ff9ed53 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -1,23 +1,39 @@
-#ifndef _QUADDTYPE_QUADBLAS_INTERFACE_H
-#define _QUADDTYPE_QUADBLAS_INTERFACE_H
+#ifndef QUADBLAS_INTERFACE_H
+#define QUADBLAS_INTERFACE_H
+
+#include <stddef.h>
+#include <Python.h>
+#include "quad_common.h"
+#include <sleefquad.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include <Python.h>
-
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result);
 
-PyObject* py_quadblas_dot(PyObject* self, PyObject* args);
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy);
 
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc);
 
-PyObject* py_quadblas_set_num_threads(PyObject* self, PyObject* args);
-PyObject* py_quadblas_get_num_threads(PyObject* self, PyObject* args);
+int
+qblas_supports_backend(QuadBackendType backend);
 
-PyObject* py_quadblas_get_version(PyObject* self, PyObject* args);
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args);
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args);
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif 
\ No newline at end of file
+#endif  // QUADBLAS_INTERFACE_H
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 00cc858..d1adc7f 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -22,43 +22,36 @@ extern "C" {
 #include "../ops.hpp"
 #include "matmul.h"
 #include "promoters.hpp"
+#include "../quadblas_interface.h"
 
 /**
  * Resolve descriptors for matmul operation.
- * Follows the same pattern as binary_ops.cpp
+ * Only supports SLEEF backend when QBLAS is enabled.
  */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
                                 npy_intp *NPY_UNUSED(view_offset))
 {
-    // Follow the exact same pattern as quad_binary_op_resolve_descriptors
     QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
     QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
-    QuadBackendType target_backend;
 
-    // Determine target backend and if casting is needed
-    NPY_CASTING casting = NPY_NO_CASTING;
-    if (descr_in1->backend != descr_in2->backend) {
-        target_backend = BACKEND_LONGDOUBLE;
-        casting = NPY_SAFE_CASTING;
-    }
-    else {
-        target_backend = descr_in1->backend;
+    // QBLAS only supports SLEEF backend
+    if (descr_in1->backend != BACKEND_SLEEF || descr_in2->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "QBLAS-accelerated matmul only supports SLEEF backend. "
+                        "Other backends are not supported with QBLAS.");
+        return (NPY_CASTING)-1;
     }
 
-    // Set up input descriptors, casting if necessary
+    // Both inputs must use SLEEF backend
+    QuadBackendType target_backend = BACKEND_SLEEF;
+    NPY_CASTING casting = NPY_NO_CASTING;
+
+    // Set up input descriptors
     for (int i = 0; i < 2; i++) {
-        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
-            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[i]) {
-                return (NPY_CASTING)-1;
-            }
-        }
-        else {
-            Py_INCREF(given_descrs[i]);
-            loop_descrs[i] = given_descrs[i];
-        }
+        Py_INCREF(given_descrs[i]);
+        loop_descrs[i] = given_descrs[i];
     }
 
     // Set up output descriptor
@@ -71,10 +64,9 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     else {
         QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
         if (descr_out->backend != target_backend) {
-            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
-            if (!loop_descrs[2]) {
-                return (NPY_CASTING)-1;
-            }
+            PyErr_SetString(PyExc_NotImplementedError,
+                            "QBLAS-accelerated matmul only supports SLEEF backend for output.");
+            return (NPY_CASTING)-1;
         }
         else {
             Py_INCREF(given_descrs[2]);
@@ -85,117 +77,166 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
 }
 
 /**
- * Matrix multiplication strided loop using NumPy 2.0 API.
- * Implements general matrix multiplication for arbitrary dimensions.
- *
- * For matmul with signature (m?,n),(n,p?)->(m?,p?):
- * - dimensions[0] = N (loop dimension, number of batch operations)
- * - dimensions[1] = m (rows of first matrix)
- * - dimensions[2] = n (cols of first matrix / rows of second matrix)
- * - dimensions[3] = p (cols of second matrix)
- *
- * - strides[0], strides[1], strides[2] = batch strides for A, B, C
- * - strides[3], strides[4] = row stride, col stride for A (m, n)
- * - strides[5], strides[6] = row stride, col stride for B (n, p)
- * - strides[7], strides[8] = row stride, col stride for C (m, p)
+ * Determine the type of operation based on input dimensions
+ */
+enum MatmulOperationType {
+    MATMUL_DOT,   // 1D x 1D -> scalar
+    MATMUL_GEMV,  // 2D x 1D -> 1D
+    MATMUL_GEMM   // 2D x 2D -> 2D
+};
+
+static MatmulOperationType
+determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
+{
+    // For matmul signature (m?,n),(n,p?)->(m?,p?):
+    // - If m=1 and p=1: vector dot product (1D x 1D)
+    // - If p=1: matrix-vector multiplication (2D x 1D)
+    // - Otherwise: matrix-matrix multiplication (2D x 2D)
+
+    if (m == 1 && p == 1) {
+        return MATMUL_DOT;
+    }
+    else if (p == 1) {
+        return MATMUL_GEMV;
+    }
+    else {
+        return MATMUL_GEMM;
+    }
+}
+
+/**
+ * Matrix multiplication strided loop using QBLAS.
+ * Automatically selects the appropriate QBLAS operation based on input dimensions.
  */
 static int
 quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                          npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
 {
     // Extract dimensions
-    npy_intp N = dimensions[0];  // Number of batch operations
+    npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
     npy_intp m = dimensions[1];  // Rows of first matrix
     npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
     npy_intp p = dimensions[3];  // Cols of second matrix
 
     // Extract batch strides
-    npy_intp A_batch_stride = strides[0];
-    npy_intp B_batch_stride = strides[1];
-    npy_intp C_batch_stride = strides[2];
+    npy_intp A_stride = strides[0];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
 
     // Extract core strides for matrix dimensions
-    npy_intp A_row_stride = strides[3];  // Stride along m dimension of A
-    npy_intp A_col_stride = strides[4];  // Stride along n dimension of A
-    npy_intp B_row_stride = strides[5];  // Stride along n dimension of B
-    npy_intp B_col_stride = strides[6];  // Stride along p dimension of B
-    npy_intp C_row_stride = strides[7];  // Stride along m dimension of C
-    npy_intp C_col_stride = strides[8];  // Stride along p dimension of C
-
-    // Get backend from descriptor
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls
+
+    // Get backend from descriptor (should be SLEEF only)
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
-
-    // Process each batch
-    for (npy_intp batch = 0; batch < N; batch++) {
-        char *A_batch = data[0] + batch * A_batch_stride;
-        char *B_batch = data[1] + batch * B_batch_stride;
-        char *C_batch = data[2] + batch * C_batch_stride;
-
-        // Perform matrix multiplication: C = A @ B
-        // C[i,j] = sum_k(A[i,k] * B[k,j])
-        for (npy_intp i = 0; i < m; i++) {
-            for (npy_intp j = 0; j < p; j++) {
-                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
-
-                if (backend == BACKEND_SLEEF) {
-                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);  // Initialize to 0
-
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
-
-                        Sleef_quad a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
-                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
-
-                        // sum += A[i,k] * B[k,j]
-                        sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val));
-                    }
-
-                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
-                }
-                else {
-                    // Long double backend
-                    long double sum = 0.0L;
-
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
-
-                        long double a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(long double));
-                        memcpy(&b_val, B_kj, sizeof(long double));
-
-                        sum += a_val * b_val;
-                    }
-
-                    memcpy(C_ij, &sum, sizeof(long double));
-                }
-            }
+    if (descr->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        return -1;
+    }
+
+    // Determine operation type
+    MatmulOperationType op_type = determine_operation_type(m, n, p);
+
+    // Constants for QBLAS
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    // print all information for debugging
+    printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N,
+           (long)m, (long)n, (long)p);
+    printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride,
+           (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride,
+           (long)C_col_stride);
+    printf("DEBUG: Operation type: %d\n", op_type);
+
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
+
+    Sleef_quad *A_ptr = (Sleef_quad *)A;
+    Sleef_quad *B_ptr = (Sleef_quad *)B;
+    Sleef_quad *C_ptr = (Sleef_quad *)C;
+
+    int result = -1;
+
+    switch (op_type) {
+        case MATMUL_DOT: {
+            // Vector dot product: C = A^T * B (both are vectors)
+            // A has shape (1, n), B has shape (n, 1), C has shape (1, 1)
+
+            printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n);
+
+            // A is effectively a vector of length n
+            // B is effectively a vector of length n
+            size_t incx = A_col_stride / sizeof(Sleef_quad);
+            size_t incy = B_row_stride / sizeof(Sleef_quad);
+
+            result = qblas_dot(n, A_ptr, incx, B_ptr, incy, C_ptr);
+            break;
         }
+
+        case MATMUL_GEMV: {
+            // Matrix-vector multiplication: C = A * B
+            // A has shape (m, n), B has shape (n, 1), C has shape (m, 1)
+
+            printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m,
+                   (long)n, (long)n);
+
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t incx = B_row_stride / sizeof(Sleef_quad);
+            size_t incy = C_row_stride / sizeof(Sleef_quad);
+
+            result =
+                    qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy);
+            break;
+        }
+
+        case MATMUL_GEMM: {
+            // Matrix-matrix multiplication: C = A * B
+            // A has shape (m, n), B has shape (n, p), C has shape (m, p)
+
+            printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p);
+
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t ldb = B_row_stride / sizeof(Sleef_quad);
+            size_t ldc = C_row_stride / sizeof(Sleef_quad);
+
+            result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
+                                C_ptr, ldc);
+            break;
+        }
+    }
+
+    if (result != 0) {
+        PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
+        return -1;
     }
 
     return 0;
 }
 
 /**
- * Register matmul support following the exact same pattern as binary_ops.cpp
+ * Register matmul support with QBLAS acceleration
  */
 int
 init_matmul_ops(PyObject *numpy)
 {
-    printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n");
+    printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n");
 
-    // Get the existing matmul ufunc - same pattern as binary_ops
+    // Get the existing matmul ufunc
     PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
         printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
-    // Use the same pattern as binary_ops.cpp
+    // Setup method specification for QBLAS-accelerated matmul
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
@@ -204,7 +245,7 @@ init_matmul_ops(PyObject *numpy)
                            {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
-            .name = "quad_matmul",
+            .name = "quad_matmul_qblas",
             .nin = 2,
             .nout = 1,
             .casting = NPY_NO_CASTING,
@@ -213,17 +254,17 @@ init_matmul_ops(PyObject *numpy)
             .slots = slots,
     };
 
-    printf("DEBUG: About to add loop to matmul ufunc...\n");
+    printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n");
 
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        printf("DEBUG: Failed to add loop to matmul ufunc\n");
+        printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n");
         Py_DECREF(ufunc);
         return -1;
     }
 
-    printf("DEBUG: Successfully added matmul loop!\n");
+    printf("DEBUG: Successfully added QBLAS matmul loop!\n");
 
-    // Add promoter following binary_ops pattern
+    // Add promoter
     PyObject *promoter_capsule =
             PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
     if (promoter_capsule == NULL) {
@@ -250,6 +291,6 @@ init_matmul_ops(PyObject *numpy)
     Py_DECREF(promoter_capsule);
     Py_DECREF(ufunc);
 
-    printf("DEBUG: init_matmul_ops completed successfully\n");
+    printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n");
     return 0;
 }
\ No newline at end of file

From 742ce642aa173487a8253da10518e30419621ec3 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 17:38:59 +0530
Subject: [PATCH 38/49] matmul ufunc completed, naive plugged, qblas
 experimental

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../src/quadblas_interface.cpp                |   2 -
 .../numpy_quaddtype/src/umath/matmul.cpp      | 189 +++++++++++-------
 3 files changed, 116 insertions(+), 77 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 4853ac1..9468e24 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 4853ac1c7d3fa3016b61e9f2b9a43f49c06d891d
+Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 185e2d8..7ef618f 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -75,7 +75,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k,
     }
 
     try {
-        // Convert layout
         QuadBLAS::Layout qblas_layout;
         if (layout == 'R' || layout == 'r') {
             qblas_layout = QuadBLAS::Layout::RowMajor;
@@ -93,7 +92,6 @@ qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k,
             return -1;  // Transpose not implemented yet
         }
 
-        // Call QBLAS GEMM
         QuadBLAS::gemm(qblas_layout, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
 
         return 0;
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index d1adc7f..e192a67 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -24,10 +24,6 @@ extern "C" {
 #include "promoters.hpp"
 #include "../quadblas_interface.h"
 
-/**
- * Resolve descriptors for matmul operation.
- * Only supports SLEEF backend when QBLAS is enabled.
- */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
@@ -76,23 +72,15 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     return casting;
 }
 
-/**
- * Determine the type of operation based on input dimensions
- */
 enum MatmulOperationType {
-    MATMUL_DOT,   // 1D x 1D -> scalar
-    MATMUL_GEMV,  // 2D x 1D -> 1D
-    MATMUL_GEMM   // 2D x 2D -> 2D
+    MATMUL_DOT,
+    MATMUL_GEMV,
+    MATMUL_GEMM
 };
 
 static MatmulOperationType
 determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
 {
-    // For matmul signature (m?,n),(n,p?)->(m?,p?):
-    // - If m=1 and p=1: vector dot product (1D x 1D)
-    // - If p=1: matrix-vector multiplication (2D x 1D)
-    // - Otherwise: matrix-matrix multiplication (2D x 2D)
-
     if (m == 1 && p == 1) {
         return MATMUL_DOT;
     }
@@ -104,10 +92,6 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
     }
 }
 
-/**
- * Matrix multiplication strided loop using QBLAS.
- * Automatically selects the appropriate QBLAS operation based on input dimensions.
- */
 static int
 quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                          npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
@@ -118,12 +102,12 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
     npy_intp p = dimensions[3];  // Cols of second matrix
 
-    // Extract batch strides
+    // batch strides
     npy_intp A_stride = strides[0];
     npy_intp B_stride = strides[1];
     npy_intp C_stride = strides[2];
 
-    // Extract core strides for matrix dimensions
+    // core strides for matrix dimensions
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
     npy_intp B_row_stride = strides[5];
@@ -131,30 +115,16 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp C_row_stride = strides[7];
     npy_intp C_col_stride = strides[8];
 
-    // Note: B_col_stride and C_col_stride not needed for row-major QBLAS calls
-
-    // Get backend from descriptor (should be SLEEF only)
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
         PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
         return -1;
     }
 
-    // Determine operation type
     MatmulOperationType op_type = determine_operation_type(m, n, p);
-
-    // Constants for QBLAS
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
 
-    // print all information for debugging
-    printf("DEBUG: Performing %ld batch operations with dimensions (%ld, %ld, %ld)\n", (long)N,
-           (long)m, (long)n, (long)p);
-    printf("DEBUG: Strides - A: (%ld, %ld), B: (%ld, %ld), C: (%ld, %ld)\n", (long)A_row_stride,
-           (long)A_col_stride, (long)B_row_stride, (long)B_col_stride, (long)C_row_stride,
-           (long)C_col_stride);
-    printf("DEBUG: Operation type: %d\n", op_type);
-
     char *A = data[0];
     char *B = data[1];
     char *C = data[2];
@@ -167,13 +137,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
 
     switch (op_type) {
         case MATMUL_DOT: {
-            // Vector dot product: C = A^T * B (both are vectors)
-            // A has shape (1, n), B has shape (n, 1), C has shape (1, 1)
-
-            printf("DEBUG: Using QBLAS dot product for %ld elements\n", (long)n);
-
-            // A is effectively a vector of length n
-            // B is effectively a vector of length n
             size_t incx = A_col_stride / sizeof(Sleef_quad);
             size_t incy = B_row_stride / sizeof(Sleef_quad);
 
@@ -182,12 +145,6 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }
 
         case MATMUL_GEMV: {
-            // Matrix-vector multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, 1), C has shape (m, 1)
-
-            printf("DEBUG: Using QBLAS GEMV for %ldx%ld matrix times %ld vector\n", (long)m,
-                   (long)n, (long)n);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t incx = B_row_stride / sizeof(Sleef_quad);
             size_t incy = C_row_stride / sizeof(Sleef_quad);
@@ -198,17 +155,46 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
         }
 
         case MATMUL_GEMM: {
-            // Matrix-matrix multiplication: C = A * B
-            // A has shape (m, n), B has shape (n, p), C has shape (m, p)
-
-            printf("DEBUG: Using QBLAS GEMM for %ldx%ldx%ld matrices\n", (long)m, (long)n, (long)p);
-
             size_t lda = A_row_stride / sizeof(Sleef_quad);
             size_t ldb = B_row_stride / sizeof(Sleef_quad);
-            size_t ldc = C_row_stride / sizeof(Sleef_quad);
+            size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+
+            Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
+            if (!temp_A_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
+            if (!temp_B_buffer) {
+                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
+                delete[] temp_A_buffer;
+                return -1;
+            }
+            memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
+            A_ptr = temp_A_buffer;
+            B_ptr = temp_B_buffer;
+
+            Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
+            if (!temp_C_buffer) {
+                PyErr_SetString(PyExc_MemoryError,
+                                "Failed to allocate temporary buffer for GEMM result");
+                return -1;
+            }
+
+            size_t ldc_temp = p;
 
             result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
-                                C_ptr, ldc);
+                                temp_C_buffer, ldc_temp);
+
+            if (result == 0) {
+                memcpy(C_ptr, temp_C_buffer, m * p * sizeof(Sleef_quad));
+            }
+
+            delete[] temp_C_buffer;
+            delete[] temp_A_buffer;
+            delete[] temp_B_buffer;
             break;
         }
     }
@@ -221,27 +207,91 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     return 0;
 }
 
-/**
- * Register matmul support with QBLAS acceleration
- */
+static int
+naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                          npy_intp const dimensions[], npy_intp const strides[],
+                          NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    npy_intp m = dimensions[1];
+    npy_intp n = dimensions[2];
+    npy_intp p = dimensions[3];
+
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+                        sum = Sleef_fmaq1_u05(a_val, b_val, sum);
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
 int
 init_matmul_ops(PyObject *numpy)
 {
-    printf("DEBUG: init_matmul_ops - registering QBLAS-accelerated matmul\n");
-
-    // Get the existing matmul ufunc
     PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
-        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
-    // Setup method specification for QBLAS-accelerated matmul
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
-                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
@@ -254,17 +304,11 @@ init_matmul_ops(PyObject *numpy)
             .slots = slots,
     };
 
-    printf("DEBUG: About to add QBLAS loop to matmul ufunc...\n");
-
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        printf("DEBUG: Failed to add QBLAS loop to matmul ufunc\n");
         Py_DECREF(ufunc);
         return -1;
     }
 
-    printf("DEBUG: Successfully added QBLAS matmul loop!\n");
-
-    // Add promoter
     PyObject *promoter_capsule =
             PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
     if (promoter_capsule == NULL) {
@@ -280,17 +324,14 @@ init_matmul_ops(PyObject *numpy)
     }
 
     if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
         PyErr_Clear();  // Don't fail if promoter fails
     }
     else {
-        printf("DEBUG: Successfully added promoter\n");
     }
 
     Py_DECREF(DTypes);
     Py_DECREF(promoter_capsule);
     Py_DECREF(ufunc);
 
-    printf("DEBUG: init_matmul_ops completed successfully with QBLAS acceleration\n");
     return 0;
 }
\ No newline at end of file

From d993bc92bb760c4e0428ac9542ed97272247a4af Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 17:47:11 +0530
Subject: [PATCH 39/49] adding release tracker to keep record for tasks, v1.0.0

---
 quaddtype/release_tracker.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index 1ecf7d3..3ed1004 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -1,4 +1,4 @@
-# Plan for `numpy-quaddtype` v1.5
+# Plan for `numpy-quaddtype` v1.0.0
 
 | ufunc name    | Added |
 | ------------- | ----- |
@@ -91,3 +91,5 @@
 | floor         | ✅    |
 | ceil          | ✅    |
 | trunc         | ✅    |
+
+- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback

From c518a29281de14955e8125cb8845200d9a8b46a9 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 15:33:46 +0000
Subject: [PATCH 40/49] it should be failing but passes on x86-64

---
 quaddtype/numpy_quaddtype/QBLAS                | 2 +-
 quaddtype/numpy_quaddtype/src/umath/matmul.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 9468e24..0eabb67 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
+Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index e192a67..f31ec89 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -290,7 +290,7 @@ init_matmul_ops(PyObject *numpy)
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
 

From bbce2ac58f930ad04a89ea877ef2013f5b49c8c7 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 18:17:04 +0000
Subject: [PATCH 41/49] ahh stupid me :), fallback to naive for MSVC

---
 quaddtype/numpy_quaddtype/__init__.py         |  6 +-
 .../src/quadblas_interface.cpp                | 94 ++++++++++++++++++-
 .../numpy_quaddtype/src/quadblas_interface.h  |  5 +
 .../numpy_quaddtype/src/umath/matmul.cpp      | 11 +++
 4 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index 8b588c1..878180b 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -39,8 +39,4 @@ def LongDoubleQuadPrecDType():
 ln10 = get_sleef_constant("ln10")
 max_value = get_sleef_constant("quad_max")
 min_value = get_sleef_constant("quad_min")
-epsilon = get_sleef_constant("epsilon")
-
-num_cores = multiprocessing.cpu_count()
-# set default number of threads for QuadBLAS
-set_num_threads(num_cores)
\ No newline at end of file
+epsilon = get_sleef_constant("epsilon")
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 7ef618f..6eb5757 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -1,10 +1,16 @@
 #include "quadblas_interface.h"
-#include "../QBLAS/include/quadblas/quadblas.hpp"
 #include <cstring>
 #include <algorithm>
 
+#ifndef DISABLE_QUADBLAS
+#include "../QBLAS/include/quadblas/quadblas.hpp"
+#endif // DISABLE_QUADBLAS
+
 extern "C" {
 
+
+#ifndef  DISABLE_QUADBLAS
+
 int
 qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
 {
@@ -138,4 +144,90 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
     return PyUnicode_FromString("QuadBLAS 1.0.0 - High Performance Quad Precision BLAS");
 }
 
+int
+quadblas_set_num_threads(int num_threads)
+{
+    QuadBLAS::set_num_threads(num_threads);
+    return 0;
+}
+
+int
+quadblas_get_num_threads(void)
+{
+    int num_threads = QuadBLAS::get_num_threads();
+    return num_threads;
+}
+
+#else  // DISABLE_QUADBLAS
+
+
+int
+qblas_dot(size_t n, Sleef_quad *x, size_t incx, Sleef_quad *y, size_t incy, Sleef_quad *result)
+{
+    return -1;  // QBLAS is disabled, dot product not available
+}
+
+int
+qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *alpha, Sleef_quad *A,
+           size_t lda, Sleef_quad *x, size_t incx, Sleef_quad *beta, Sleef_quad *y, size_t incy)
+{
+    return -1;  // QBLAS is disabled, GEMV not available
+}
+
+int
+qblas_gemm(char layout, char transa, char transb, size_t m, size_t n, size_t k, Sleef_quad *alpha,
+           Sleef_quad *A, size_t lda, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C,
+           size_t ldc)
+{
+    return -1;  // QBLAS is disabled, GEMM not available
+}
+
+int
+qblas_supports_backend(QuadBackendType backend)
+{
+    return -1; // QBLAS is disabled, backend support not available
+}
+
+PyObject *
+py_quadblas_set_num_threads(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+PyObject *
+py_quadblas_get_num_threads(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+PyObject *
+py_quadblas_get_version(PyObject *self, PyObject *args)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return NULL;
+}
+
+int
+quadblas_set_num_threads(int num_threads)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return -1;
+}
+
+int
+quadblas_get_num_threads(void)
+{
+    // raise error
+    PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
+    return -1;
+}
+
+#endif // DISABLE_QUADBLAS
+
 }  // extern "C"
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index ff9ed53..82a685a 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -32,6 +32,11 @@ py_quadblas_get_num_threads(PyObject *self, PyObject *args);
 PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args);
 
+int
+quadblas_set_num_threads(int num_threads);
+int
+quadblas_get_num_threads(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index f31ec89..c9fec50 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -289,10 +289,21 @@ init_matmul_ops(PyObject *numpy)
 
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
+    #ifndef DISABLE_QUADBLAS
+    // set threading to max
+    int num_threads = quadblas_get_num_threads();
+    quadblas_set_num_threads(num_threads);
+    
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
+    #else
+    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+                           {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
+                           {0, NULL}};
+    #endif // DISABLE_QUADBLAS
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul_qblas",

From 5e5fa659fcce5e3daa3a1d7cd771244290ea8531 Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Sat, 19 Jul 2025 18:26:27 +0000
Subject: [PATCH 42/49] switching to internal function use only

---
 quaddtype/numpy_quaddtype/src/quadblas_interface.cpp | 8 ++++----
 quaddtype/numpy_quaddtype/src/quadblas_interface.h   | 4 ++--
 quaddtype/numpy_quaddtype/src/umath/matmul.cpp       | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
index 6eb5757..65feb60 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.cpp
@@ -145,14 +145,14 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
 }
 
 int
-quadblas_set_num_threads(int num_threads)
+_quadblas_set_num_threads(int num_threads)
 {
     QuadBLAS::set_num_threads(num_threads);
     return 0;
 }
 
 int
-quadblas_get_num_threads(void)
+_quadblas_get_num_threads(void)
 {
     int num_threads = QuadBLAS::get_num_threads();
     return num_threads;
@@ -213,7 +213,7 @@ py_quadblas_get_version(PyObject *self, PyObject *args)
 }
 
 int
-quadblas_set_num_threads(int num_threads)
+_quadblas_set_num_threads(int num_threads)
 {
     // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
@@ -221,7 +221,7 @@ quadblas_set_num_threads(int num_threads)
 }
 
 int
-quadblas_get_num_threads(void)
+_quadblas_get_num_threads(void)
 {
     // raise error
     PyErr_SetString(PyExc_NotImplementedError, "QuadBLAS is disabled");
diff --git a/quaddtype/numpy_quaddtype/src/quadblas_interface.h b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
index 82a685a..76033eb 100644
--- a/quaddtype/numpy_quaddtype/src/quadblas_interface.h
+++ b/quaddtype/numpy_quaddtype/src/quadblas_interface.h
@@ -33,9 +33,9 @@ PyObject *
 py_quadblas_get_version(PyObject *self, PyObject *args);
 
 int
-quadblas_set_num_threads(int num_threads);
+_quadblas_set_num_threads(int num_threads);
 int
-quadblas_get_num_threads(void);
+_quadblas_get_num_threads(void);
 
 #ifdef __cplusplus
 }
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index c9fec50..98053b8 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -291,9 +291,9 @@ init_matmul_ops(PyObject *numpy)
 
     #ifndef DISABLE_QUADBLAS
     // set threading to max
-    int num_threads = quadblas_get_num_threads();
-    quadblas_set_num_threads(num_threads);
-    
+    int num_threads = _quadblas_get_num_threads();
+    _quadblas_set_num_threads(num_threads);
+
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},

From cec5ace3a011cb38c57bf0cd33b99b5df0d36cc6 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 03:46:52 +0530
Subject: [PATCH 43/49] this should fix them all

---
 quaddtype/numpy_quaddtype/QBLAS               |   2 +-
 .../numpy_quaddtype/src/umath/matmul.cpp      | 216 +++++++++++++-----
 2 files changed, 160 insertions(+), 58 deletions(-)

diff --git a/quaddtype/numpy_quaddtype/QBLAS b/quaddtype/numpy_quaddtype/QBLAS
index 0eabb67..9468e24 160000
--- a/quaddtype/numpy_quaddtype/QBLAS
+++ b/quaddtype/numpy_quaddtype/QBLAS
@@ -1 +1 @@
-Subproject commit 0eabb677431c6148434c50deba7abd6902d74b16
+Subproject commit 9468e24a02b731563eba2aee0350e9219b36c102
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 98053b8..354a342 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -93,8 +93,9 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
 }
 
 static int
-quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
-                         npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
+quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                 npy_intp const dimensions[], npy_intp const strides[],
+                                 NpyAuxData *auxdata)
 {
     // Extract dimensions
     npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
@@ -149,6 +150,8 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
             size_t incx = B_row_stride / sizeof(Sleef_quad);
             size_t incy = C_row_stride / sizeof(Sleef_quad);
 
+            memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+
             result =
                     qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, C_ptr, incy);
             break;
@@ -159,32 +162,132 @@ quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
             size_t ldb = B_row_stride / sizeof(Sleef_quad);
             size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
 
+            memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+
+            size_t ldc_temp = p;
+
+            result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
+                                C_ptr, ldc_numpy);
+            break;
+        }
+    }
+
+    if (result != 0) {
+        PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int
+quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                   npy_intp const dimensions[], npy_intp const strides[],
+                                   NpyAuxData *auxdata)
+{
+    // Extract dimensions
+    npy_intp N = dimensions[0];  // Batch size, this remains always 1 for matmul afaik
+    npy_intp m = dimensions[1];  // Rows of first matrix
+    npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
+    npy_intp p = dimensions[3];  // Cols of second matrix
+
+    // batch strides
+    npy_intp A_stride = strides[0];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
+
+    // core strides for matrix dimensions
+    npy_intp A_row_stride = strides[3];
+    npy_intp A_col_stride = strides[4];
+    npy_intp B_row_stride = strides[5];
+    npy_intp B_col_stride = strides[6];
+    npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    if (descr->backend != BACKEND_SLEEF) {
+        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        return -1;
+    }
+
+    MatmulOperationType op_type = determine_operation_type(m, n, p);
+    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
+    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
+
+    Sleef_quad *A_ptr = (Sleef_quad *)A;
+    Sleef_quad *B_ptr = (Sleef_quad *)B;
+    Sleef_quad *C_ptr = (Sleef_quad *)C;
+
+    int result = -1;
+
+    switch (op_type) {
+        case MATMUL_DOT: {
+            Sleef_quad *temp_A_buffer = new Sleef_quad[n];
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n];
+
+            memcpy(temp_A_buffer, A_ptr, n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * sizeof(Sleef_quad));
+
+            size_t incx = 1;
+            size_t incy = 1;
+
+            result = qblas_dot(n, temp_A_buffer, incx, temp_B_buffer, incy, C_ptr);
+
+            delete[] temp_A_buffer;
+            delete[] temp_B_buffer;
+            break;
+        }
+
+        case MATMUL_GEMV: {
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t incx = B_row_stride / sizeof(Sleef_quad);
+            size_t incy = C_row_stride / sizeof(Sleef_quad);
+
             Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
-            if (!temp_A_buffer) {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
-                delete[] temp_A_buffer;
-                return -1;
-            }
             Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
-            if (!temp_B_buffer) {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate temporary buffer for GEMM");
-                delete[] temp_A_buffer;
-                return -1;
-            }
             memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
             memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
             A_ptr = temp_A_buffer;
             B_ptr = temp_B_buffer;
 
+            // Use temp_C_buffer to avoid unaligned writes
             Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
-            if (!temp_C_buffer) {
-                PyErr_SetString(PyExc_MemoryError,
-                                "Failed to allocate temporary buffer for GEMM result");
-                return -1;
-            }
 
+            lda = n;
+            incx = 1;
+            incy = 1;
+
+            memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
+
+            result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta,
+                                temp_C_buffer, incy);
+            break;
+        }
+
+        case MATMUL_GEMM: {
+            size_t lda = A_row_stride / sizeof(Sleef_quad);
+            size_t ldb = B_row_stride / sizeof(Sleef_quad);
+            size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+
+            Sleef_quad *temp_A_buffer = new Sleef_quad[m * n];
+            Sleef_quad *temp_B_buffer = new Sleef_quad[n * p];
+            memcpy(temp_A_buffer, A_ptr, m * n * sizeof(Sleef_quad));
+            memcpy(temp_B_buffer, B_ptr, n * p * sizeof(Sleef_quad));
+            A_ptr = temp_A_buffer;
+            B_ptr = temp_B_buffer;
+
+            // since these are now contiguous so,
+            lda = n;
+            ldb = p;
             size_t ldc_temp = p;
 
+            Sleef_quad *temp_C_buffer = new Sleef_quad[m * p];
+            memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
+
             result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, &beta,
                                 temp_C_buffer, ldc_temp);
 
@@ -218,8 +321,8 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     npy_intp p = dimensions[3];
 
     npy_intp A_batch_stride = strides[0];
-    npy_intp B_batch_stride = strides[1];
-    npy_intp C_batch_stride = strides[2];
+    npy_intp B_stride = strides[1];
+    npy_intp C_stride = strides[2];
 
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
@@ -232,46 +335,44 @@ naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
     QuadBackendType backend = descr->backend;
     size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
 
-    for (npy_intp batch = 0; batch < N; batch++) {
-        char *A_batch = data[0] + batch * A_batch_stride;
-        char *B_batch = data[1] + batch * B_batch_stride;
-        char *C_batch = data[2] + batch * C_batch_stride;
-
-        for (npy_intp i = 0; i < m; i++) {
-            for (npy_intp j = 0; j < p; j++) {
-                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+    char *A = data[0];
+    char *B = data[1];
+    char *C = data[2];
 
-                if (backend == BACKEND_SLEEF) {
-                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
+    for (npy_intp i = 0; i < m; i++) {
+        for (npy_intp j = 0; j < p; j++) {
+            char *C_ij = C + i * C_row_stride + j * C_col_stride;
 
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+            if (backend == BACKEND_SLEEF) {
+                Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);
 
-                        Sleef_quad a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
-                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
-                        sum = Sleef_fmaq1_u05(a_val, b_val, sum);
-                    }
+                for (npy_intp k = 0; k < n; k++) {
+                    char *A_ik = A + i * A_row_stride + k * A_col_stride;
+                    char *B_kj = B + k * B_row_stride + j * B_col_stride;
 
-                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                    Sleef_quad a_val, b_val;
+                    memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                    memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+                    sum = Sleef_fmaq1_u05(a_val, b_val, sum);
                 }
-                else {
-                    long double sum = 0.0L;
 
-                    for (npy_intp k = 0; k < n; k++) {
-                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
-                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+                memcpy(C_ij, &sum, sizeof(Sleef_quad));
+            }
+            else {
+                long double sum = 0.0L;
 
-                        long double a_val, b_val;
-                        memcpy(&a_val, A_ik, sizeof(long double));
-                        memcpy(&b_val, B_kj, sizeof(long double));
+                for (npy_intp k = 0; k < n; k++) {
+                    char *A_ik = A + i * A_row_stride + k * A_col_stride;
+                    char *B_kj = B + k * B_row_stride + j * B_col_stride;
 
-                        sum += a_val * b_val;
-                    }
+                    long double a_val, b_val;
+                    memcpy(&a_val, A_ik, sizeof(long double));
+                    memcpy(&b_val, B_kj, sizeof(long double));
 
-                    memcpy(C_ij, &sum, sizeof(long double));
+                    sum += a_val * b_val;
                 }
+
+                memcpy(C_ij, &sum, sizeof(long double));
             }
         }
     }
@@ -289,21 +390,22 @@ init_matmul_ops(PyObject *numpy)
 
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
-    #ifndef DISABLE_QUADBLAS
+#ifndef DISABLE_QUADBLAS
     // set threading to max
     int num_threads = _quadblas_get_num_threads();
     _quadblas_set_num_threads(num_threads);
 
-    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
-                           {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
-                           {0, NULL}};
-    #else
+    PyType_Slot slots[] = {
+            {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+            {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop_aligned},
+            {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop_unaligned},
+            {0, NULL}};
+#else
     PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
                            {NPY_METH_strided_loop, (void *)&naive_matmul_strided_loop},
                            {NPY_METH_unaligned_strided_loop, (void *)&naive_matmul_strided_loop},
                            {0, NULL}};
-    #endif // DISABLE_QUADBLAS
+#endif  // DISABLE_QUADBLAS
 
     PyArrayMethod_Spec Spec = {
             .name = "quad_matmul_qblas",
@@ -335,7 +437,7 @@ init_matmul_ops(PyObject *numpy)
     }
 
     if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        PyErr_Clear();  // Don't fail if promoter fails
+        PyErr_Clear();
     }
     else {
     }

From 1fe6c8172582ec2d24d34640afef72f53e2c078d Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 04:14:10 +0530
Subject: [PATCH 44/49] wrapping up

---
 quaddtype/README.md                          |  8 +++++--
 quaddtype/numpy_quaddtype/src/umath/matmul.h | 23 --------------------
 quaddtype/release_tracker.md                 |  2 --
 3 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/quaddtype/README.md b/quaddtype/README.md
index af4ddef..11eef1d 100644
--- a/quaddtype/README.md
+++ b/quaddtype/README.md
@@ -27,7 +27,7 @@ np.array([1,2,3], dtype=QuadPrecDType("longdouble"))
 
 The code needs the quad precision pieces of the sleef library, which
 is not available on most systems by default, so we have to generate
-that first.  The below assumes one has the required pieces to build
+that first. The below assumes one has the required pieces to build
 sleef (cmake and libmpfr-dev), and that one is in the package
 directory locally.
 
@@ -40,6 +40,7 @@ cd ..
 ```
 
 Building the `numpy-quaddtype` package from locally installed sleef:
+
 ```bash
 export SLEEF_DIR=$PWD/sleef/build
 export LIBRARY_PATH=$SLEEF_DIR/lib
@@ -57,10 +58,13 @@ export LDFLAGS="-Wl,-rpath,$SLEEF_DIR/lib -fopenmp -latomic -lpthread"
 export CFLAGS="-fPIC"
 export CXXFLAGS="-fPIC"
 
+# To build without QBLAS (default for MSVC)
+# export CFLAGS="-fPIC -DDISABLE_QUADBLAS"
+# export CXXFLAGS="-fPIC -DDISABLE_QUADBLAS"
+
 python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
 
 # Run the tests
 cd ..
 python -m pytest
 ```
-
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.h b/quaddtype/numpy_quaddtype/src/umath/matmul.h
index 947e2c3..1285849 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.h
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.h
@@ -1,35 +1,12 @@
 #ifndef _QUADDTYPE_MATMUL_H
 #define _QUADDTYPE_MATMUL_H
 
-/**
- * Quad Precision Matrix Multiplication for NumPy
- *
- * This module implements matrix multiplication functionality for the QuadPrecDType
- * by registering custom loops with numpy's matmul generalized ufunc.
- *
- * Supports all matmul operation types:
- * - Vector-vector (dot product): (n,) @ (n,) -> scalar
- * - Matrix-vector: (m,n) @ (n,) -> (m,)
- * - Vector-matrix: (n,) @ (n,p) -> (p,)
- * - Matrix-matrix: (m,n) @ (n,p) -> (m,p)
- *
- * Uses naive algorithms optimized for correctness rather than performance.
- * For production use, consider integration with QBLAS optimized routines.
- */
-
 #include <Python.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**
- * Initialize the matmul operations for the quad precision dtype.
- * This function registers the matmul generalized ufunc with numpy.
- *
- * @param numpy The numpy module object
- * @return 0 on success, -1 on failure
- */
 int
 init_matmul_ops(PyObject *numpy);
 
diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index 3ed1004..fbd3b20 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -91,5 +91,3 @@
 | floor         | ✅    |
 | ceil          | ✅    |
 | trunc         | ✅    |
-
-- Fixing QBLAS integration to work unaligned arrays without or recovering from bad allocation fallback

From 8f16b9904b9360e6daa902cfd400816ce8a759ff Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 04:27:01 +0530
Subject: [PATCH 45/49] updated branch to main

---
 .github/workflows/build_wheels.yml | 2 +-
 .github/workflows/ci.yml           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 0bf55c4..37a8038 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,7 +3,7 @@ name: Build Wheels
 on:
   push:
     branches:
-      - dot
+      - main
     tags:
       - "quaddtype-v*"
     paths:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e42c12c..542093c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,7 +3,7 @@ name: Numpy User DTypes CI
 on:
   push:
     branches:
-      - dot
+      - main
   pull_request:
   workflow_dispatch:
 

From ed47e3368962b45c1a034effcf8af6db811b7270 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 14:21:03 +0530
Subject: [PATCH 46/49] added test coverage in release_tracker.md

---
 quaddtype/release_tracker.md | 196 +++++++++++++++++++----------------
 1 file changed, 105 insertions(+), 91 deletions(-)

diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index fbd3b20..4891685 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -1,93 +1,107 @@
 # Plan for `numpy-quaddtype` v1.0.0
 
-| ufunc name    | Added |
-| ------------- | ----- |
-| add           | ✅    |
-| subtract      | ✅    |
-| multiply      | ✅    |
-| matmul        | #116  |
-| divide        | ✅    |
-| logaddexp     |       |
-| logaddexp2    |       |
-| true_divide   |       |
-| floor_divide  |       |
-| negative      | ✅    |
-| positive      | ✅    |
-| power         | ✅    |
-| float_power   |       |
-| remainder     |       |
-| mod           | ✅    |
-| fmod          |       |
-| divmod        |       |
-| absolute      | ✅    |
-| fabs          |       |
-| rint          | ✅    |
-| sign          |       |
-| heaviside     |       |
-| conj          |       |
-| conjugate     |       |
-| exp           | ✅    |
-| exp2          | ✅    |
-| log           | ✅    |
-| log2          | ✅    |
-| log10         | ✅    |
-| expm1         |       |
-| log1p         | ✅    |
-| sqrt          | ✅    |
-| square        | ✅    |
-| cbrt          |       |
-| reciprocal    |       |
-| gcd           |       |
-| lcm           |       |
-| sin           | ✅    |
-| cos           | ✅    |
-| tan           | ✅    |
-| arcsin        | ✅    |
-| arccos        | ✅    |
-| arctan        | ✅    |
-| arctan2       | ✅    |
-| hypot         |       |
-| sinh          |       |
-| cosh          |       |
-| tanh          |       |
-| arcsinh       |       |
-| arccosh       |       |
-| arctanh       |       |
-| degrees       |       |
-| radians       |       |
-| deg2rad       |       |
-| rad2deg       |       |
-| bitwise_and   |       |
-| bitwise_or    |       |
-| bitwise_xor   |       |
-| invert        |       |
-| left_shift    |       |
-| right_shift   |       |
-| greater       | ✅    |
-| greater_equal | ✅    |
-| less          | ✅    |
-| less_equal    | ✅    |
-| not_equal     | ✅    |
-| equal         | ✅    |
-| logical_and   |       |
-| logical_or    |       |
-| logical_xor   |       |
-| logical_not   |       |
-| maximum       | ✅    |
-| minimum       | ✅    |
-| fmax          |       |
-| fmin          |       |
-| isfinite      |       |
-| isinf         |       |
-| isnan         |       |
-| isnat         |       |
-| signbit       |       |
-| copysign      |       |
-| nextafter     |       |
-| spacing       |       |
-| modf          |       |
-| ldexp         |       |
-| frexp         |       |
-| floor         | ✅    |
-| ceil          | ✅    |
-| trunc         | ✅    |
+| ufunc name    | Added | Edge Cases Tested\*                                                                   |
+| ------------- | ----- | ------------------------------------------------------------------------------------- |
+| add           | ✅    | ✅                                                                                    |
+| subtract      | ✅    | ✅                                                                                    |
+| multiply      | ✅    | ✅                                                                                    |
+| matmul        | #116  | 🟡 _Need: special values (NaN/inf/-0.0), degenerate cases (0×n, 1×1), extreme values_ |
+| divide        | ✅    | ✅                                                                                    |
+| logaddexp     |       |                                                                                       |
+| logaddexp2    |       |                                                                                       |
+| true_divide   |       |                                                                                       |
+| floor_divide  |       |                                                                                       |
+| negative      | ✅    | ✅                                                                                    |
+| positive      | ✅    | ✅                                                                                    |
+| power         | ✅    | ✅                                                                                    |
+| float_power   |       |                                                                                       |
+| remainder     |       |                                                                                       |
+| mod           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-0.0/large values)_                       |
+| fmod          |       |                                                                                       |
+| divmod        |       |                                                                                       |
+| absolute      | ✅    | ✅                                                                                    |
+| fabs          |       |                                                                                       |
+| rint          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway cases)_                      |
+| sign          |       |                                                                                       |
+| heaviside     |       |                                                                                       |
+| conj          |       |                                                                                       |
+| conjugate     |       |                                                                                       |
+| exp           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_               |
+| exp2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_               |
+| log           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
+| log2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
+| log10         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
+| expm1         |       |                                                                                       |
+| log1p         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-1/small values)_                         |
+| sqrt          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values)_                               |
+| square        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/large values)_                          |
+| cbrt          |       |                                                                                       |
+| reciprocal    |       |                                                                                       |
+| gcd           |       |                                                                                       |
+| lcm           |       |                                                                                       |
+| sin           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_                  |
+| cos           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_                  |
+| tan           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π/2 asymptotes)_                        |
+| arcsin        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_                        |
+| arccos        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_                        |
+| arctan        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/asymptotes)_                            |
+| arctan2       | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/quadrant coverage)_                     |
+| hypot         |       |                                                                                       |
+| sinh          |       |                                                                                       |
+| cosh          |       |                                                                                       |
+| tanh          |       |                                                                                       |
+| arcsinh       |       |                                                                                       |
+| arccosh       |       |                                                                                       |
+| arctanh       |       |                                                                                       |
+| degrees       |       |                                                                                       |
+| radians       |       |                                                                                       |
+| deg2rad       |       |                                                                                       |
+| rad2deg       |       |                                                                                       |
+| bitwise_and   |       |                                                                                       |
+| bitwise_or    |       |                                                                                       |
+| bitwise_xor   |       |                                                                                       |
+| invert        |       |                                                                                       |
+| left_shift    |       |                                                                                       |
+| right_shift   |       |                                                                                       |
+| greater       | ✅    | ✅                                                                                    |
+| greater_equal | ✅    | ✅                                                                                    |
+| less          | ✅    | ✅                                                                                    |
+| less_equal    | ✅    | ✅                                                                                    |
+| not_equal     | ✅    | ✅                                                                                    |
+| equal         | ✅    | ✅                                                                                    |
+| logical_and   |       |                                                                                       |
+| logical_or    |       |                                                                                       |
+| logical_xor   |       |                                                                                       |
+| logical_not   |       |                                                                                       |
+| maximum       | ✅    | ✅                                                                                    |
+| minimum       | ✅    | ✅                                                                                    |
+| fmax          |       |                                                                                       |
+| fmin          |       |                                                                                       |
+| isfinite      |       |                                                                                       |
+| isinf         |       |                                                                                       |
+| isnan         |       |                                                                                       |
+| isnat         |       |                                                                                       |
+| signbit       |       |                                                                                       |
+| copysign      |       |                                                                                       |
+| nextafter     |       |                                                                                       |
+| spacing       |       |                                                                                       |
+| modf          |       |                                                                                       |
+| ldexp         |       |                                                                                       |
+| frexp         |       |                                                                                       |
+| floor         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_                     |
+| ceil          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_                     |
+| trunc         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/fractional values)_                  |
+
+\* **Edge Cases Tested**: Indicates whether the ufunc has parametrized tests that compare QuadPrecision results against `float` and `np.float64` for edge cases including:
+
+- Special values: `0.0`, `-0.0`, `inf`, `-inf`, `nan`, `-nan`
+- For trigonometric functions: Critical points like `0`, `π/2`, `π`, `3π/2`, `2π`, values in `[0, 2π]`
+- For logarithmic functions: Values near `0`, `1`, large values
+- For exponential functions: Large positive/negative values, values near `0`
+
+**Testing Status:**
+
+- ✅ = Comprehensive edge case tests exist in `test_quaddtype.py` with parametrized tests against float64
+- 🟡 = Good basic testing exists but missing some edge cases (specific missing tests noted in italics)
+- ❌ = Ufunc is implemented but lacks systematic testing (required tests noted in italics)
+- (blank) = Ufunc not yet implemented (implementation needed first)

From 573eb76b6656cd67fb6797edb069bfaf11ca9218 Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Sun, 20 Jul 2025 20:52:31 +0530
Subject: [PATCH 47/49] more edge tests

---
 quaddtype/release_tracker.md | 182 +++++++--------
 quaddtype/tests/test_dot.py  | 432 +++++++++++++++++++++++++++++++++++
 2 files changed, 523 insertions(+), 91 deletions(-)

diff --git a/quaddtype/release_tracker.md b/quaddtype/release_tracker.md
index 4891685..fe398e6 100644
--- a/quaddtype/release_tracker.md
+++ b/quaddtype/release_tracker.md
@@ -1,96 +1,96 @@
 # Plan for `numpy-quaddtype` v1.0.0
 
-| ufunc name    | Added | Edge Cases Tested\*                                                                   |
-| ------------- | ----- | ------------------------------------------------------------------------------------- |
-| add           | ✅    | ✅                                                                                    |
-| subtract      | ✅    | ✅                                                                                    |
-| multiply      | ✅    | ✅                                                                                    |
-| matmul        | #116  | 🟡 _Need: special values (NaN/inf/-0.0), degenerate cases (0×n, 1×1), extreme values_ |
-| divide        | ✅    | ✅                                                                                    |
-| logaddexp     |       |                                                                                       |
-| logaddexp2    |       |                                                                                       |
-| true_divide   |       |                                                                                       |
-| floor_divide  |       |                                                                                       |
-| negative      | ✅    | ✅                                                                                    |
-| positive      | ✅    | ✅                                                                                    |
-| power         | ✅    | ✅                                                                                    |
-| float_power   |       |                                                                                       |
-| remainder     |       |                                                                                       |
-| mod           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-0.0/large values)_                       |
-| fmod          |       |                                                                                       |
-| divmod        |       |                                                                                       |
-| absolute      | ✅    | ✅                                                                                    |
-| fabs          |       |                                                                                       |
-| rint          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway cases)_                      |
-| sign          |       |                                                                                       |
-| heaviside     |       |                                                                                       |
-| conj          |       |                                                                                       |
-| conjugate     |       |                                                                                       |
-| exp           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_               |
-| exp2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_               |
-| log           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
-| log2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
-| log10         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_                             |
-| expm1         |       |                                                                                       |
-| log1p         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-1/small values)_                         |
-| sqrt          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values)_                               |
-| square        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/large values)_                          |
-| cbrt          |       |                                                                                       |
-| reciprocal    |       |                                                                                       |
-| gcd           |       |                                                                                       |
-| lcm           |       |                                                                                       |
-| sin           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_                  |
-| cos           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_                  |
-| tan           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π/2 asymptotes)_                        |
-| arcsin        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_                        |
-| arccos        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_                        |
-| arctan        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/asymptotes)_                            |
-| arctan2       | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/quadrant coverage)_                     |
-| hypot         |       |                                                                                       |
-| sinh          |       |                                                                                       |
-| cosh          |       |                                                                                       |
-| tanh          |       |                                                                                       |
-| arcsinh       |       |                                                                                       |
-| arccosh       |       |                                                                                       |
-| arctanh       |       |                                                                                       |
-| degrees       |       |                                                                                       |
-| radians       |       |                                                                                       |
-| deg2rad       |       |                                                                                       |
-| rad2deg       |       |                                                                                       |
-| bitwise_and   |       |                                                                                       |
-| bitwise_or    |       |                                                                                       |
-| bitwise_xor   |       |                                                                                       |
-| invert        |       |                                                                                       |
-| left_shift    |       |                                                                                       |
-| right_shift   |       |                                                                                       |
-| greater       | ✅    | ✅                                                                                    |
-| greater_equal | ✅    | ✅                                                                                    |
-| less          | ✅    | ✅                                                                                    |
-| less_equal    | ✅    | ✅                                                                                    |
-| not_equal     | ✅    | ✅                                                                                    |
-| equal         | ✅    | ✅                                                                                    |
-| logical_and   |       |                                                                                       |
-| logical_or    |       |                                                                                       |
-| logical_xor   |       |                                                                                       |
-| logical_not   |       |                                                                                       |
-| maximum       | ✅    | ✅                                                                                    |
-| minimum       | ✅    | ✅                                                                                    |
-| fmax          |       |                                                                                       |
-| fmin          |       |                                                                                       |
-| isfinite      |       |                                                                                       |
-| isinf         |       |                                                                                       |
-| isnan         |       |                                                                                       |
-| isnat         |       |                                                                                       |
-| signbit       |       |                                                                                       |
-| copysign      |       |                                                                                       |
-| nextafter     |       |                                                                                       |
-| spacing       |       |                                                                                       |
-| modf          |       |                                                                                       |
-| ldexp         |       |                                                                                       |
-| frexp         |       |                                                                                       |
-| floor         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_                     |
-| ceil          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_                     |
-| trunc         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/fractional values)_                  |
+| ufunc name    | Added | Edge Cases Tested\*                                                     |
+| ------------- | ----- | ----------------------------------------------------------------------- |
+| add           | ✅    | ✅                                                                      |
+| subtract      | ✅    | ✅                                                                      |
+| multiply      | ✅    | ✅                                                                      |
+| matmul        | #116  | ✅                                                                      |
+| divide        | ✅    | ✅                                                                      |
+| logaddexp     |       |                                                                         |
+| logaddexp2    |       |                                                                         |
+| true_divide   |       |                                                                         |
+| floor_divide  |       |                                                                         |
+| negative      | ✅    | ✅                                                                      |
+| positive      | ✅    | ✅                                                                      |
+| power         | ✅    | ✅                                                                      |
+| float_power   |       |                                                                         |
+| remainder     |       |                                                                         |
+| mod           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-0.0/large values)_         |
+| fmod          |       |                                                                         |
+| divmod        |       |                                                                         |
+| absolute      | ✅    | ✅                                                                      |
+| fabs          |       |                                                                         |
+| rint          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway cases)_        |
+| sign          |       |                                                                         |
+| heaviside     |       |                                                                         |
+| conj          |       |                                                                         |
+| conjugate     |       |                                                                         |
+| exp           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_ |
+| exp2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/large +/- values/overflow)_ |
+| log           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_               |
+| log2          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_               |
+| log10         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values/1)_               |
+| expm1         |       |                                                                         |
+| log1p         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/-1/small values)_           |
+| sqrt          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/-values)_                 |
+| square        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/large values)_            |
+| cbrt          |       |                                                                         |
+| reciprocal    |       |                                                                         |
+| gcd           |       |                                                                         |
+| lcm           |       |                                                                         |
+| sin           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_    |
+| cos           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π multiples/2π range)_    |
+| tan           | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/π/2 asymptotes)_          |
+| arcsin        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_          |
+| arccos        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±1/out-of-domain)_          |
+| arctan        | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/asymptotes)_              |
+| arctan2       | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/0/quadrant coverage)_       |
+| hypot         |       |                                                                         |
+| sinh          |       |                                                                         |
+| cosh          |       |                                                                         |
+| tanh          |       |                                                                         |
+| arcsinh       |       |                                                                         |
+| arccosh       |       |                                                                         |
+| arctanh       |       |                                                                         |
+| degrees       |       |                                                                         |
+| radians       |       |                                                                         |
+| deg2rad       |       |                                                                         |
+| rad2deg       |       |                                                                         |
+| bitwise_and   |       |                                                                         |
+| bitwise_or    |       |                                                                         |
+| bitwise_xor   |       |                                                                         |
+| invert        |       |                                                                         |
+| left_shift    |       |                                                                         |
+| right_shift   |       |                                                                         |
+| greater       | ✅    | ✅                                                                      |
+| greater_equal | ✅    | ✅                                                                      |
+| less          | ✅    | ✅                                                                      |
+| less_equal    | ✅    | ✅                                                                      |
+| not_equal     | ✅    | ✅                                                                      |
+| equal         | ✅    | ✅                                                                      |
+| logical_and   |       |                                                                         |
+| logical_or    |       |                                                                         |
+| logical_xor   |       |                                                                         |
+| logical_not   |       |                                                                         |
+| maximum       | ✅    | ✅                                                                      |
+| minimum       | ✅    | ✅                                                                      |
+| fmax          |       |                                                                         |
+| fmin          |       |                                                                         |
+| isfinite      |       |                                                                         |
+| isinf         |       |                                                                         |
+| isnan         |       |                                                                         |
+| isnat         |       |                                                                         |
+| signbit       |       |                                                                         |
+| copysign      |       |                                                                         |
+| nextafter     |       |                                                                         |
+| spacing       |       |                                                                         |
+| modf          |       |                                                                         |
+| ldexp         |       |                                                                         |
+| frexp         |       |                                                                         |
+| floor         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_       |
+| ceil          | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/halfway values)_       |
+| trunc         | ✅    | ❌ _Need: basic tests + edge cases (NaN/inf/±0.0/fractional values)_    |
 
 \* **Edge Cases Tested**: Indicates whether the ufunc has parametrized tests that compare QuadPrecision results against `float` and `np.float64` for edge cases including:
 
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index 9256f3d..31d64ce 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -63,6 +63,47 @@ def create_quad_array(values, shape=None):
     raise ValueError("Unsupported values or shape")
 
 
+def is_special_value(val):
+    """Check if a value is NaN or infinite"""
+    try:
+        float_val = float(val)
+        return np.isnan(float_val) or np.isinf(float_val)
+    except:
+        return False
+
+
+def arrays_equal_with_nan(a, b, rtol=1e-15, atol=1e-15):
+    """Compare arrays that may contain NaN values"""
+    if a.shape != b.shape:
+        return False
+    
+    flat_a = a.flatten()
+    flat_b = b.flatten()
+    
+    for i, (val_a, val_b) in enumerate(zip(flat_a, flat_b)):
+        # Handle NaN cases
+        if is_special_value(val_a) and is_special_value(val_b):
+            float_a = float(val_a)
+            float_b = float(val_b)
+            # Both NaN
+            if np.isnan(float_a) and np.isnan(float_b):
+                continue
+            # Both infinite with same sign
+            elif np.isinf(float_a) and np.isinf(float_b) and np.sign(float_a) == np.sign(float_b):
+                continue
+            else:
+                return False
+        elif is_special_value(val_a) or is_special_value(val_b):
+            return False
+        else:
+            try:
+                assert_quad_equal(val_a, val_b, rtol, atol)
+            except AssertionError:
+                return False
+    
+    return True
+
+
 # ================================================================================
 # VECTOR-VECTOR DOT PRODUCT TESTS
 # ================================================================================
@@ -255,6 +296,397 @@ def test_associativity(self):
         assert_quad_array_equal(result1, result2, rtol=1e-25)
 
 
+# ================================================================================
+# SPECIAL VALUES EDGE CASE TESTS
+# ================================================================================
+
+class TestSpecialValueEdgeCases:
+    """Test matmul with special IEEE 754 values (NaN, inf, -0.0)"""
+    
+    @pytest.mark.parametrize("special_val", ["0.0", "-0.0", "inf", "-inf", "nan", "-nan"])
+    def test_vector_with_special_values(self, special_val):
+        """Test vectors containing special values"""
+        # Create vectors with special values
+        x = create_quad_array([1.0, float(special_val), 2.0])
+        y = create_quad_array([3.0, 4.0, 5.0])
+        
+        result = np.matmul(x, y)
+        
+        # Compare with float64 reference
+        x_float = np.array([1.0, float(special_val), 2.0], dtype=np.float64)
+        y_float = np.array([3.0, 4.0, 5.0], dtype=np.float64)
+        expected = np.matmul(x_float, y_float)
+        
+        # Handle special value comparisons
+        if np.isnan(expected):
+            assert np.isnan(float(result))
+        elif np.isinf(expected):
+            assert np.isinf(float(result))
+            assert np.sign(float(result)) == np.sign(expected)
+        else:
+            assert_quad_equal(result, expected)
+    
+    @pytest.mark.parametrize("special_val", ["0.0", "-0.0", "inf", "-inf", "nan"])
+    def test_matrix_vector_with_special_values(self, special_val):
+        """Test matrix-vector multiplication with special values"""
+        # Matrix with special value
+        A = create_quad_array([1.0, float(special_val), 3.0, 4.0], shape=(2, 2))
+        x = create_quad_array([2.0, 1.0])
+        
+        result = np.matmul(A, x)
+        
+        # Compare with float64 reference  
+        A_float = np.array([[1.0, float(special_val)], [3.0, 4.0]], dtype=np.float64)
+        x_float = np.array([2.0, 1.0], dtype=np.float64)
+        expected = np.matmul(A_float, x_float)
+        
+        assert result.shape == expected.shape
+        for i in range(len(expected)):
+            if np.isnan(expected[i]):
+                assert np.isnan(float(result[i]))
+            elif np.isinf(expected[i]):
+                assert np.isinf(float(result[i]))
+                assert np.sign(float(result[i])) == np.sign(expected[i])
+            else:
+                assert_quad_equal(result[i], expected[i])
+    
+    @pytest.mark.parametrize("special_val", ["0.0", "-0.0", "inf", "-inf", "nan"])
+    def test_matrix_matrix_with_special_values(self, special_val):
+        """Test matrix-matrix multiplication with special values"""
+        A = create_quad_array([1.0, 2.0, float(special_val), 4.0], shape=(2, 2))
+        B = create_quad_array([5.0, 6.0, 7.0, 8.0], shape=(2, 2))
+        
+        result = np.matmul(A, B)
+        
+        # Compare with float64 reference
+        A_float = np.array([[1.0, 2.0], [float(special_val), 4.0]], dtype=np.float64)
+        B_float = np.array([[5.0, 6.0], [7.0, 8.0]], dtype=np.float64)
+        expected = np.matmul(A_float, B_float)
+        
+        assert result.shape == expected.shape
+        assert arrays_equal_with_nan(result, expected)
+    
+    def test_all_nan_matrix(self):
+        """Test matrices filled with NaN"""
+        A = create_quad_array([float('nan')] * 4, shape=(2, 2))
+        B = create_quad_array([1, 2, 3, 4], shape=(2, 2))
+        
+        result = np.matmul(A, B)
+        
+        # Result should be all NaN (NaN * anything = NaN)
+        for i in range(2):
+            for j in range(2):
+                assert np.isnan(float(result[i, j]))
+    
+    def test_inf_times_zero_produces_nan(self):
+        """Test that Inf * 0 correctly produces NaN per IEEE 754"""
+        # Create a scenario where Inf * 0 occurs in matrix multiplication
+        A = create_quad_array([float('inf'), 1.0], shape=(1, 2))
+        B = create_quad_array([0.0, 1.0], shape=(2, 1))
+        
+        result = np.matmul(A, B)
+        
+        # Result should be inf*0 + 1*1 = NaN + 1 = NaN
+        assert np.isnan(float(result[0, 0])), "Inf * 0 should produce NaN per IEEE 754"
+    
+    def test_nan_propagation(self):
+        """Test that NaN properly propagates through matrix operations"""
+        A = create_quad_array([1.0, float('nan'), 3.0, 4.0], shape=(2, 2))
+        B = create_quad_array([1.0, 0.0, 0.0, 1.0], shape=(2, 2))  # Identity
+        
+        result = np.matmul(A, B)
+        
+        # C[0,0] = 1*1 + nan*0 = 1 + nan = nan (nan*0 = nan, just as inf*0 = nan)
+        # C[0,1] = 1*0 + nan*1 = 0 + nan = nan  
+        # C[1,0] = 3*1 + 4*0 = 3 + 0 = 3
+        # C[1,1] = 3*0 + 4*1 = 0 + 4 = 4
+        assert np.isnan(float(result[0, 0]))
+        assert np.isnan(float(result[0, 1]))
+        assert_quad_equal(result[1, 0], 3.0)
+        assert_quad_equal(result[1, 1], 4.0)
+    
+    def test_zero_division_and_indeterminate_forms(self):
+        """Test handling of indeterminate forms in matrix operations"""
+        # Test various indeterminate forms that should produce NaN
+        
+        # Case: Inf - Inf form
+        A = create_quad_array([float('inf'), float('inf')], shape=(1, 2))
+        B = create_quad_array([1.0, -1.0], shape=(2, 1))
+        
+        result = np.matmul(A, B)
+        
+        # Result should be inf*1 + inf*(-1) = inf - inf = NaN
+        assert np.isnan(float(result[0, 0])), "Inf - Inf should produce NaN per IEEE 754"
+    
+    def test_mixed_inf_values(self):
+        """Test matrices with mixed infinite values"""
+        # Use all-ones matrix to avoid Inf * 0 = NaN issues
+        A = create_quad_array([float('inf'), 2, float('-inf'), 3], shape=(2, 2))
+        B = create_quad_array([1, 1, 1, 1], shape=(2, 2))  # All ones to avoid Inf*0
+        
+        result = np.matmul(A, B)
+        
+        # C[0,0] = inf*1 + 2*1 = inf + 2 = inf
+        # C[0,1] = inf*1 + 2*1 = inf + 2 = inf  
+        # C[1,0] = -inf*1 + 3*1 = -inf + 3 = -inf
+        # C[1,1] = -inf*1 + 3*1 = -inf + 3 = -inf
+        assert np.isinf(float(result[0, 0])) and float(result[0, 0]) > 0
+        assert np.isinf(float(result[0, 1])) and float(result[0, 1]) > 0
+        assert np.isinf(float(result[1, 0])) and float(result[1, 0]) < 0  
+        assert np.isinf(float(result[1, 1])) and float(result[1, 1]) < 0
+
+
+# ================================================================================
+# DEGENERATE AND EMPTY CASE TESTS
+# ================================================================================
+
+class TestDegenerateCases:
+    """Test edge cases with degenerate dimensions"""
+    
+    def test_single_element_matrices(self):
+        """Test 1x1 matrix operations"""
+        A = create_quad_array([3.0], shape=(1, 1))
+        B = create_quad_array([4.0], shape=(1, 1))
+        
+        result = np.matmul(A, B)
+        
+        assert result.shape == (1, 1)
+        assert_quad_equal(result[0, 0], 12.0)
+    
+    def test_single_element_vector(self):
+        """Test operations with single-element vectors"""
+        x = create_quad_array([5.0])
+        y = create_quad_array([7.0])
+        
+        result = np.matmul(x, y)
+        
+        assert isinstance(result, QuadPrecision)
+        assert_quad_equal(result, 35.0)
+    
+    def test_very_tall_matrix(self):
+        """Test very tall matrices (1000x1)"""
+        size = 1000
+        A = create_quad_array([1.0] * size, shape=(size, 1))
+        B = create_quad_array([2.0], shape=(1, 1))
+        
+        result = np.matmul(A, B)
+        
+        assert result.shape == (size, 1)
+        for i in range(min(10, size)):  # Check first 10 elements
+            assert_quad_equal(result[i, 0], 2.0)
+    
+    def test_very_wide_matrix(self):
+        """Test very wide matrices (1x1000)"""
+        size = 1000
+        A = create_quad_array([1.0], shape=(1, 1))  
+        B = create_quad_array([3.0] * size, shape=(1, size))
+        
+        result = np.matmul(A, B)
+        
+        assert result.shape == (1, size)
+        for i in range(min(10, size)):  # Check first 10 elements
+            assert_quad_equal(result[0, i], 3.0)
+    
+    def test_zero_matrices(self):
+        """Test matrices filled with zeros"""
+        A = create_quad_array([0.0] * 9, shape=(3, 3))
+        B = create_quad_array([1, 2, 3, 4, 5, 6, 7, 8, 9], shape=(3, 3))
+        
+        result = np.matmul(A, B)
+        
+        assert result.shape == (3, 3)
+        for i in range(3):
+            for j in range(3):
+                assert_quad_equal(result[i, j], 0.0)
+    
+    def test_repeated_row_matrix(self):
+        """Test matrices with repeated rows"""
+        # Matrix with all rows the same
+        A = create_quad_array([1, 2, 3] * 3, shape=(3, 3))  # Each row is [1, 2, 3]
+        B = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3))  # Identity
+        
+        result = np.matmul(A, B)
+        
+        # Result should have all rows equal to [1, 2, 3]
+        for i in range(3):
+            assert_quad_equal(result[i, 0], 1.0)
+            assert_quad_equal(result[i, 1], 2.0)
+            assert_quad_equal(result[i, 2], 3.0)
+    
+    def test_repeated_column_matrix(self):
+        """Test matrices with repeated columns"""
+        A = create_quad_array([1, 0, 0, 0, 1, 0, 0, 0, 1], shape=(3, 3))  # Identity
+        B = create_quad_array([2, 2, 2, 3, 3, 3, 4, 4, 4], shape=(3, 3))  # Each column repeated
+        
+        result = np.matmul(A, B)
+        
+        # Result should be same as B (identity multiplication)
+        assert_quad_array_equal(result, B)
+
+
+# ================================================================================
+# NUMERICAL STABILITY AND PRECISION TESTS
+# ================================================================================
+
+class TestNumericalStability:
+    """Test numerical stability with extreme values"""
+    
+    def test_very_large_values(self):
+        """Test matrices with very large values"""
+        large_val = 1e100
+        A = create_quad_array([large_val, 1, 1, large_val], shape=(2, 2))
+        B = create_quad_array([1, 0, 0, 1], shape=(2, 2))  # Identity
+        
+        result = np.matmul(A, B)
+        
+        # Should preserve large values without overflow
+        assert_quad_equal(result[0, 0], large_val)
+        assert_quad_equal(result[1, 1], large_val)
+        assert not np.isinf(float(result[0, 0]))
+        assert not np.isinf(float(result[1, 1]))
+    
+    def test_very_small_values(self):
+        """Test matrices with very small values"""
+        small_val = 1e-100
+        A = create_quad_array([small_val, 0, 0, small_val], shape=(2, 2))
+        B = create_quad_array([1, 0, 0, 1], shape=(2, 2))  # Identity
+        
+        result = np.matmul(A, B)
+        
+        # Should preserve small values without underflow
+        assert_quad_equal(result[0, 0], small_val)
+        assert_quad_equal(result[1, 1], small_val)
+        assert float(result[0, 0]) != 0.0
+        assert float(result[1, 1]) != 0.0
+    
+    def test_mixed_scale_values(self):
+        """Test matrices with mixed magnitude values"""
+        A = create_quad_array([1e100, 1e-100, 1e50, 1e-50], shape=(2, 2))
+        B = create_quad_array([1, 0, 0, 1], shape=(2, 2))  # Identity
+        
+        result = np.matmul(A, B)
+        
+        # All values should be preserved accurately
+        assert_quad_equal(result[0, 0], 1e100)
+        assert_quad_equal(result[0, 1], 1e-100)
+        assert_quad_equal(result[1, 0], 1e50)
+        assert_quad_equal(result[1, 1], 1e-50)
+    
+    def test_precision_critical_case(self):
+        """Test case that would lose precision in double"""
+        # Create a case where large values cancel in the dot product
+        # Vector: [1e20, 1.0, -1e20] dot [1, 0, 1] equals 0 (the 1.0 term is multiplied by 0)
+        x = create_quad_array([1e20, 1.0, -1e20])
+        y = create_quad_array([1.0, 0.0, 1.0])
+        
+        result = np.matmul(x, y)
+        
+        # The result above is 1e20*1 + 1.0*0 + (-1e20)*1 = 1e20 - 1e20 = 0, which does not isolate the 1.0 term.
+        # Use y = [0, 1, 0] instead: [1e20, 1.0, -1e20] dot [0, 1, 0] = 1.0
+        x = create_quad_array([1e20, 1.0, -1e20])
+        y = create_quad_array([0.0, 1.0, 0.0])
+        
+        result = np.matmul(x, y)
+        
+        # Only the middle term is selected, so the result must be exactly 1.0
+        assert_quad_equal(result, 1.0, atol=1e-25)
+    
+    def test_condition_number_extreme(self):
+        """Test matrices with extreme condition numbers"""
+        # Nearly singular matrix (very small determinant)
+        eps = 1e-50
+        A = create_quad_array([1, 1, 1, 1+eps], shape=(2, 2))
+        B = create_quad_array([1, 0, 0, 1], shape=(2, 2))
+        
+        result = np.matmul(A, B)
+        
+        # Result should be computed accurately
+        assert_quad_equal(result[0, 0], 1.0)
+        assert_quad_equal(result[0, 1], 1.0)
+        assert_quad_equal(result[1, 0], 1.0)
+        assert_quad_equal(result[1, 1], 1.0 + eps)
+    
+    def test_accumulation_precision(self):
+        """Test precision in accumulation of many terms"""
+        size = 100
+        # Create vectors where each term contributes equally
+        x_vals = [1.0 / size] * size
+        y_vals = [1.0] * size
+        
+        x = create_quad_array(x_vals)
+        y = create_quad_array(y_vals)
+        
+        result = np.matmul(x, y)
+        
+        # Result should be exactly 1.0 
+        assert_quad_equal(result, 1.0, atol=1e-25)
+
+
+# ================================================================================
+# CROSS-VALIDATION TESTS
+# ================================================================================
+
+class TestCrossValidation:
+    """Test consistency with float64 reference implementations"""
+    
+    @pytest.mark.parametrize("size", [2, 3, 5, 10])
+    def test_consistency_with_float64_vectors(self, size):
+        """Test vector operations consistency with float64"""
+        # Use values well within float64 range
+        x_vals = [i + 0.5 for i in range(size)]
+        y_vals = [2 * i + 1.5 for i in range(size)]
+        
+        # QuadPrecision computation
+        x_quad = create_quad_array(x_vals)
+        y_quad = create_quad_array(y_vals)
+        result_quad = np.matmul(x_quad, y_quad)
+        
+        # float64 reference
+        x_float = np.array(x_vals, dtype=np.float64)
+        y_float = np.array(y_vals, dtype=np.float64)
+        result_float = np.matmul(x_float, y_float)
+        
+        # Results should match within float64 precision
+        assert_quad_equal(result_quad, result_float, rtol=1e-14)
+    
+    @pytest.mark.parametrize("m,n,k", [(2,2,2), (3,3,3), (4,5,6)])
+    def test_consistency_with_float64_matrices(self, m, n, k):
+        """Test matrix operations consistency with float64"""
+        # Create test matrices with float64-representable values
+        A_vals = [(i + j + 1) * 0.25 for i in range(m) for j in range(k)]
+        B_vals = [(i * 2 + j) * 0.125 for i in range(k) for j in range(n)]
+        
+        # QuadPrecision computation
+        A_quad = create_quad_array(A_vals, shape=(m, k))
+        B_quad = create_quad_array(B_vals, shape=(k, n))
+        result_quad = np.matmul(A_quad, B_quad)
+        
+        # float64 reference
+        A_float = np.array(A_vals, dtype=np.float64).reshape(m, k)
+        B_float = np.array(B_vals, dtype=np.float64).reshape(k, n)
+        result_float = np.matmul(A_float, B_float)
+        
+        # Results should match within float64 precision
+        for i in range(m):
+            for j in range(n):
+                assert_quad_equal(result_quad[i, j], result_float[i, j], rtol=1e-14)
+    
+    def test_quad_precision_advantage(self):
+        """Test cases where quad precision shows advantage over float64"""
+        A = create_quad_array([1.0, 1e-30], shape=(1, 2))
+        B = create_quad_array([1.0, 1.0], shape=(2, 1))
+        
+        result_quad = np.matmul(A, B)
+        
+        # The result should be 1.0 + 1e-30 = 1.0000000000000000000000000000001
+        expected = 1.0 + 1e-30
+        assert_quad_equal(result_quad[0, 0], expected, rtol=1e-25)
+        
+        # Verify that this value is actually different from 1.0 in quad precision
+        diff = result_quad[0, 0] - 1.0
+        assert abs(diff) > 0  # Should be non-zero in quad precision
+
+
 # ================================================================================
 # LARGE MATRIX TESTS
 # ================================================================================

From c795ef38545a4a52ce971f5b38293f348a3fc49f Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Tue, 22 Jul 2025 17:06:40 +0000
Subject: [PATCH 48/49] adding windows instructions and some small refactor as
 per reviews

---
 quaddtype/README.md                           |  98 +++++++++++++++-
 quaddtype/numpy_quaddtype/__init__.py         |   2 -
 .../numpy_quaddtype/src/umath/matmul.cpp      |  13 ++-
 quaddtype/tests/test_dot.py                   | 108 +-----------------
 quaddtype/tests/test_utils.py                 |  98 ++++++++++++++++
 5 files changed, 202 insertions(+), 117 deletions(-)
 create mode 100644 quaddtype/tests/test_utils.py

diff --git a/quaddtype/README.md b/quaddtype/README.md
index 11eef1d..3c58645 100644
--- a/quaddtype/README.md
+++ b/quaddtype/README.md
@@ -25,11 +25,11 @@ np.array([1,2,3], dtype=QuadPrecDType("longdouble"))
 
 ## Installation from source
 
-The code needs the quad precision pieces of the sleef library, which
-is not available on most systems by default, so we have to generate
-that first. The below assumes one has the required pieces to build
-sleef (cmake and libmpfr-dev), and that one is in the package
-directory locally.
+The code needs the quad precision pieces of the sleef library, which is not available on most systems by default, so we have to generate that first. Choose the appropriate section below based on your operating system.
+
+### Linux/Unix/macOS
+
+The below assumes one has the required pieces to build sleef (cmake and libmpfr-dev), and that one is in the package directory locally.
 
 ```bash
 git clone --branch 3.8 https://github.com/shibatch/sleef.git
@@ -68,3 +68,91 @@ python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args
 cd ..
 python -m pytest
 ```
+
+### Windows
+
+#### Prerequisites
+
+- **Visual Studio 2017 or later** (with MSVC compiler)
+- **CMake** (≥3.15)
+- **Python 3.10+** 
+- **Git**
+
+#### Step-by-Step Installation
+
+1. **Setup Development Environment**
+
+   Open a **Developer Command Prompt for VS** or **Developer PowerShell for VS** to ensure MSVC is properly configured.
+
+2. **Clone and Build SLEEF**
+
+   ```powershell
+   # Clone SLEEF library
+   git clone --branch 3.8 https://github.com/shibatch/sleef.git
+   cd sleef
+
+   # Configure with CMake for Windows
+   cmake -S . -B build -G "Visual Studio 17 2022" -A x64 -DSLEEF_BUILD_QUAD:BOOL=ON -DSLEEF_BUILD_SHARED_LIBS:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+   # Build and install SLEEF
+   cmake --build build --config Release
+   cmake --install build --prefix "C:/sleef" --config Release
+
+   cd ..
+   ```
+
+3. **Setup Python Environment**
+
+   ```powershell
+   # Create and activate virtual environment
+   python -m venv numpy_quad_env
+   .\numpy_quad_env\Scripts\Activate.ps1
+
+   # Install build dependencies
+   pip install -U pip
+   pip install meson-python numpy pytest ninja meson
+   ```
+
+4. **Set Environment Variables**
+
+   ```powershell
+   # Set up paths and compiler flags
+   $env:INCLUDE = "C:/sleef/include;$env:INCLUDE"
+   $env:LIB = "C:/sleef/lib;$env:LIB"
+   $env:PATH = "C:/sleef/bin;$env:PATH"
+
+   # Note: QBLAS is disabled on Windows due to MSVC compatibility issues
+   $env:CFLAGS = "/IC:/sleef/include /DDISABLE_QUADBLAS"
+   $env:CXXFLAGS = "/IC:/sleef/include /DDISABLE_QUADBLAS"
+   $env:LDFLAGS = "C:/sleef/lib/sleef.lib C:/sleef/lib/sleefquad.lib"
+   ```
+
+5. **Build and Install numpy-quaddtype**
+
+   ```powershell
+   # Ensure submodules are initialized
+   git submodule update --init --recursive
+
+   # Build and install the package
+   python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
+   ```
+
+6. **Test Installation**
+
+   ```powershell
+   # Run tests
+   pytest -s tests/
+   ```
+#### Notes
+1. **QBLAS Disabled**: QuadBLAS optimization is automatically disabled on Windows builds due to MSVC compatibility issues. This is handled by the `/DDISABLE_QUADBLAS` compiler flag set in `CFLAGS`/`CXXFLAGS` in step 4.
+
+2. **Visual Studio Version**: The instructions assume Visual Studio 2022. For other versions, adjust the generator string:
+   - VS 2019: `"Visual Studio 16 2019"`
+   - VS 2017: `"Visual Studio 15 2017"`
+
+3. **Architecture**: The instructions are for x64. For x86 builds, change `-A x64` to `-A Win32`.
+
+4. **Alternative SLEEF Location**: If you prefer to install SLEEF elsewhere, update all path references accordingly.
+
+#### Windows Troubleshooting
+- **Link errors**: Verify that `sleef.lib` and `sleefquad.lib` exist in `C:/sleef/lib/`
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
index 878180b..8da0a76 100644
--- a/quaddtype/numpy_quaddtype/__init__.py
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -8,8 +8,6 @@
     get_quadblas_version
 )
 
-import multiprocessing
-
 __all__ = [
     'QuadPrecision', 'QuadPrecDType', 'SleefQuadPrecision', 'LongDoubleQuadPrecision',
     'SleefQuadPrecDType', 'LongDoubleQuadPrecDType', 'is_longdouble_128', 
diff --git a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
index 354a342..6ac0d5c 100644
--- a/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
+++ b/quaddtype/numpy_quaddtype/src/umath/matmul.cpp
@@ -36,7 +36,7 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
     if (descr_in1->backend != BACKEND_SLEEF || descr_in2->backend != BACKEND_SLEEF) {
         PyErr_SetString(PyExc_NotImplementedError,
                         "QBLAS-accelerated matmul only supports SLEEF backend. "
-                        "Other backends are not supported with QBLAS.");
+                        "Please raise the issue at SwayamInSync/QBLAS for longdouble support");
         return (NPY_CASTING)-1;
     }
 
@@ -61,7 +61,8 @@ quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[
         QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
         if (descr_out->backend != target_backend) {
             PyErr_SetString(PyExc_NotImplementedError,
-                            "QBLAS-accelerated matmul only supports SLEEF backend for output.");
+                        "QBLAS-accelerated matmul only supports SLEEF backend. "
+                        "Please raise the issue at SwayamInSync/QBLAS for longdouble support");
             return (NPY_CASTING)-1;
         }
         else {
@@ -118,7 +119,9 @@ quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const dat
 
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
-        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "QBLAS-accelerated matmul only supports SLEEF backend. "
+                        "Please raise the issue at SwayamInSync/QBLAS for longdouble support");
         return -1;
     }
 
@@ -206,7 +209,9 @@ quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const d
 
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
-        PyErr_SetString(PyExc_RuntimeError, "Internal error: non-SLEEF backend in QBLAS matmul");
+        PyErr_SetString(PyExc_NotImplementedError,
+                        "QBLAS-accelerated matmul only supports SLEEF backend. "
+                        "Please raise the issue at SwayamInSync/QBLAS for longdouble support");
         return -1;
     }
 
diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index 31d64ce..b8428b8 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -1,109 +1,9 @@
 import pytest
 import numpy as np
+from test_utils import create_quad_array, assert_quad_equal, assert_quad_array_equal, arrays_equal_with_nan
 from numpy_quaddtype import QuadPrecision, QuadPrecDType
 
 
-# ================================================================================
-# UTILITIES
-# ================================================================================
-
-def assert_quad_equal(a, b, rtol=1e-15, atol=1e-15):
-    """Assert two quad precision values are equal within tolerance"""
-    # Ensure both operands are QuadPrecision objects for the comparison
-    if not isinstance(a, QuadPrecision):
-        a = QuadPrecision(str(a), backend='sleef')
-    if not isinstance(b, QuadPrecision):
-        b = QuadPrecision(str(b), backend='sleef')
-
-    # Use quad-precision arithmetic to calculate the difference
-    diff = abs(a - b)
-    tolerance = QuadPrecision(str(atol), backend='sleef') + QuadPrecision(str(rtol), backend='sleef') * max(abs(a), abs(b))
-    
-    # Assert using quad-precision objects
-    assert diff <= tolerance, f"Values not equal: {a} != {b} (diff: {diff}, tol: {tolerance})"
-
-
-def assert_quad_array_equal(a, b, rtol=1e-25, atol=1e-25):
-    """Assert two quad precision arrays are equal within tolerance"""
-    assert a.shape == b.shape, f"Shapes don't match: {a.shape} vs {b.shape}"
-    
-    flat_a = a.flatten()
-    flat_b = b.flatten()
-    
-    for i, (val_a, val_b) in enumerate(zip(flat_a, flat_b)):
-        try:
-            assert_quad_equal(val_a, val_b, rtol, atol)
-        except AssertionError as e:
-            raise AssertionError(f"Arrays differ at index {i}: {e}")
-
-
-def create_quad_array(values, shape=None):
-    """Create a QuadPrecision array from values using Sleef backend"""
-    dtype = QuadPrecDType(backend='sleef')
-    
-    if isinstance(values, (list, tuple)):
-        if shape is None:
-            # 1D array
-            quad_values = [QuadPrecision(str(float(v)), backend='sleef') for v in values]
-            return np.array(quad_values, dtype=dtype)
-        else:
-            # Reshape to specified shape
-            if len(shape) == 1:
-                quad_values = [QuadPrecision(str(float(v)), backend='sleef') for v in values]
-                return np.array(quad_values, dtype=dtype)
-            elif len(shape) == 2:
-                m, n = shape
-                assert len(values) == m * n, f"Values length {len(values)} doesn't match shape {shape}"
-                quad_matrix = []
-                for i in range(m):
-                    row = [QuadPrecision(str(float(values[i * n + j])), backend='sleef') for j in range(n)]
-                    quad_matrix.append(row)
-                return np.array(quad_matrix, dtype=dtype)
-    
-    raise ValueError("Unsupported values or shape")
-
-
-def is_special_value(val):
-    """Check if a value is NaN or infinite"""
-    try:
-        float_val = float(val)
-        return np.isnan(float_val) or np.isinf(float_val)
-    except:
-        return False
-
-
-def arrays_equal_with_nan(a, b, rtol=1e-15, atol=1e-15):
-    """Compare arrays that may contain NaN values"""
-    if a.shape != b.shape:
-        return False
-    
-    flat_a = a.flatten()
-    flat_b = b.flatten()
-    
-    for i, (val_a, val_b) in enumerate(zip(flat_a, flat_b)):
-        # Handle NaN cases
-        if is_special_value(val_a) and is_special_value(val_b):
-            float_a = float(val_a)
-            float_b = float(val_b)
-            # Both NaN
-            if np.isnan(float_a) and np.isnan(float_b):
-                continue
-            # Both infinite with same sign
-            elif np.isinf(float_a) and np.isinf(float_b) and np.sign(float_a) == np.sign(float_b):
-                continue
-            else:
-                return False
-        elif is_special_value(val_a) or is_special_value(val_b):
-            return False
-        else:
-            try:
-                assert_quad_equal(val_a, val_b, rtol, atol)
-            except AssertionError:
-                return False
-    
-    return True
-
-
 # ================================================================================
 # VECTOR-VECTOR DOT PRODUCT TESTS
 # ================================================================================
@@ -789,8 +689,4 @@ def test_dimension_mismatch_matrices(self):
         B = create_quad_array([1, 2, 3, 4, 5, 6], shape=(3, 2))  # Wrong size
         
         with pytest.raises(ValueError, match=r"matmul: Input operand 1 has a mismatch in its core dimension 0"):
-            np.matmul(A, B)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
\ No newline at end of file
+            np.matmul(A, B)
\ No newline at end of file
diff --git a/quaddtype/tests/test_utils.py b/quaddtype/tests/test_utils.py
new file mode 100644
index 0000000..dcdbc67
--- /dev/null
+++ b/quaddtype/tests/test_utils.py
@@ -0,0 +1,98 @@
+from numpy_quaddtype import QuadPrecision, QuadPrecDType
+import numpy as np
+
+def assert_quad_equal(a, b, rtol=1e-15, atol=1e-15):
+    """Assert |a - b| <= atol + rtol * max(|a|, |b|); on failure the AssertionError message reports both values, the diff and the tolerance"""
+    # Operands that are not already QuadPrecision are converted via str() into sleef-backend QuadPrecision
+    if not isinstance(a, QuadPrecision):
+        a = QuadPrecision(str(a), backend='sleef')
+    if not isinstance(b, QuadPrecision):
+        b = QuadPrecision(str(b), backend='sleef')
+
+    # Use quad-precision arithmetic to calculate the difference; rtol and atol are converted via str() as well
+    diff = abs(a - b)
+    tolerance = QuadPrecision(str(atol), backend='sleef') + QuadPrecision(str(rtol), backend='sleef') * max(abs(a), abs(b))
+    
+    # Assert using quad-precision objects
+    assert diff <= tolerance, f"Values not equal: {a} != {b} (diff: {diff}, tol: {tolerance})"
+
+
+def assert_quad_array_equal(a, b, rtol=1e-25, atol=1e-25):
+    """Assert a and b have the same shape and every pair of flattened elements passes assert_quad_equal(rtol, atol); failures report the flat index"""
+    assert a.shape == b.shape, f"Shapes don't match: {a.shape} vs {b.shape}"
+    
+    flat_a = a.flatten()
+    flat_b = b.flatten()
+    
+    for i, (val_a, val_b) in enumerate(zip(flat_a, flat_b)):
+        try:
+            assert_quad_equal(val_a, val_b, rtol, atol)
+        except AssertionError as e:
+            raise AssertionError(f"Arrays differ at index {i}: {e}")
+
+
+def create_quad_array(values, shape=None):
+    """Create a sleef-backend QuadPrecision array from a list/tuple (1-D, or row-major 2-D for a 2-element shape); raise ValueError for anything else"""
+    dtype = QuadPrecDType(backend='sleef')
+    
+    if isinstance(values, (list, tuple)):
+        if shape is None:
+            # 1D array; each value goes through str(float(v)) before becoming a QuadPrecision
+            quad_values = [QuadPrecision(str(float(v)), backend='sleef') for v in values]
+            return np.array(quad_values, dtype=dtype)
+        else:
+            # Only 1-D and 2-D shapes are handled here; any other length falls through to the ValueError below
+            if len(shape) == 1:
+                quad_values = [QuadPrecision(str(float(v)), backend='sleef') for v in values]
+                return np.array(quad_values, dtype=dtype)
+            elif len(shape) == 2:
+                m, n = shape
+                assert len(values) == m * n, f"Values length {len(values)} doesn't match shape {shape}"
+                quad_matrix = []
+                for i in range(m):
+                    row = [QuadPrecision(str(float(values[i * n + j])), backend='sleef') for j in range(n)]
+                    quad_matrix.append(row)
+                return np.array(quad_matrix, dtype=dtype)
+    
+    raise ValueError("Unsupported values or shape")
+
+
+def is_special_value(val):
+    """Return whether float(val) is NaN or infinite; return False when val cannot be converted to float"""
+    try:
+        float_val = float(val)
+        return np.isnan(float_val) or np.isinf(float_val)
+    except Exception:
+        return False
+
+
+def arrays_equal_with_nan(a, b, rtol=1e-15, atol=1e-15):
+    """Return True if a and b have equal shapes and match element-wise: NaN matches NaN, inf matches same-signed inf, other values must pass assert_quad_equal"""
+    if a.shape != b.shape:
+        return False
+    
+    flat_a = a.flatten()
+    flat_b = b.flatten()
+    
+    for i, (val_a, val_b) in enumerate(zip(flat_a, flat_b)):
+        # Both values are special (NaN or infinite): compare them as Python floats
+        if is_special_value(val_a) and is_special_value(val_b):
+            float_a = float(val_a)
+            float_b = float(val_b)
+            # Both NaN
+            if np.isnan(float_a) and np.isnan(float_b):
+                continue
+            # Both infinite with same sign
+            elif np.isinf(float_a) and np.isinf(float_b) and np.sign(float_a) == np.sign(float_b):
+                continue
+            else:
+                return False
+        elif is_special_value(val_a) or is_special_value(val_b):
+            return False
+        else:
+            try:
+                assert_quad_equal(val_a, val_b, rtol, atol)
+            except AssertionError:
+                return False
+    
+    return True
\ No newline at end of file

From 07a16c0d9f66dbb547361cdf5bd83eb1f754082a Mon Sep 17 00:00:00 2001
From: swayaminsync <hawkempire007@gmail.com>
Date: Wed, 23 Jul 2025 01:04:40 +0530
Subject: [PATCH 49/49] rename utils

---
 quaddtype/tests/test_dot.py                 | 2 +-
 quaddtype/tests/{test_utils.py => utils.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename quaddtype/tests/{test_utils.py => utils.py} (100%)

diff --git a/quaddtype/tests/test_dot.py b/quaddtype/tests/test_dot.py
index b8428b8..b1c16c6 100644
--- a/quaddtype/tests/test_dot.py
+++ b/quaddtype/tests/test_dot.py
@@ -1,6 +1,6 @@
 import pytest
 import numpy as np
-from test_utils import create_quad_array, assert_quad_equal, assert_quad_array_equal, arrays_equal_with_nan
+from utils import create_quad_array, assert_quad_equal, assert_quad_array_equal, arrays_equal_with_nan
 from numpy_quaddtype import QuadPrecision, QuadPrecDType
 
 
diff --git a/quaddtype/tests/test_utils.py b/quaddtype/tests/utils.py
similarity index 100%
rename from quaddtype/tests/test_utils.py
rename to quaddtype/tests/utils.py