diff --git a/src/FsMath/Matrix.fs b/src/FsMath/Matrix.fs
index 62b2f23..280bfc2 100644
--- a/src/FsMath/Matrix.fs
+++ b/src/FsMath/Matrix.fs
@@ -141,43 +141,72 @@ type Matrix<'T when 'T :> Numerics.INumber<'T>
this.GetSlice(rowStart, rowEnd, colStart, colEnd)
- /// Creates a new matrix by initializing each element with a function `f(row, col)`.
+ ///
+ /// Transposes a matrix using cache-friendly blocked algorithm with loop unrolling.
+ /// The block size is chosen adaptively based on element type for optimal cache utilization.
+ /// Within each block, uses loop unrolling to reduce loop overhead and improve instruction-level parallelism.
+ ///
static member inline private transposeByBlock<'T when 'T :> Numerics.INumber<'T>
and 'T : (new: unit -> 'T)
and 'T : struct
and 'T :> ValueType>
- (rows : int)
+ (rows : int)
(cols : int)
- (data: 'T[])
+ (data: 'T[])
(blockSize: int) =
- //let blockSize = defaultArg blockSize 16
-
let src = data
let dst = Array.zeroCreate<'T> (rows * cols)
- let vectorSize = Numerics.Vector<'T>.Count
-
- // Process the matrix in blocks
+ // Process the matrix in blocks for cache efficiency
for i0 in 0 .. blockSize .. rows - 1 do
for j0 in 0 .. blockSize .. cols - 1 do
-
let iMax = min (i0 + blockSize) rows
let jMax = min (j0 + blockSize) cols
+ // Within each block, unroll the innermost loop by 4
for i in i0 .. iMax - 1 do
- let srcOffset = i * cols
- for j in j0 .. jMax - 1 do
- let v = src.[srcOffset + j]
+ let mutable j = j0
+ let srcRowOffset = i * cols
+
+ // Unrolled loop: process 4 columns at a time
+ while j + 3 < jMax do
+ let v0 = src.[srcRowOffset + j]
+ let v1 = src.[srcRowOffset + j + 1]
+ let v2 = src.[srcRowOffset + j + 2]
+ let v3 = src.[srcRowOffset + j + 3]
+
+ dst.[j * rows + i] <- v0
+ dst.[(j + 1) * rows + i] <- v1
+ dst.[(j + 2) * rows + i] <- v2
+ dst.[(j + 3) * rows + i] <- v3
+
+ j <- j + 4
+
+ // Handle remaining columns
+ while j < jMax do
+ let v = src.[srcRowOffset + j]
dst.[j * rows + i] <- v
+ j <- j + 1
dst
static member inline transpose (m:Matrix<'T>) : Matrix<'T> =
m.Transpose()
+ ///
+ /// Transposes this matrix (rows become columns, columns become rows).
+ /// Uses an adaptive block size based on element type for optimal cache performance.
+ ///
member this.Transpose() =
- let blocksize = 16
+ // Adaptive block size based on element type
+ // Larger elements (float64) benefit from smaller blocks to fit in L1 cache
+ // Smaller elements (float32, int) can use larger blocks
+ let blocksize =
+ match sizeof<'T> with
+ | 4 -> 32 // float32 or int32: 32x32 block = 4KB fits in L1
+ | 8 -> 16 // float64: 16x16 block = 2KB fits in L1
+ | _ -> 16 // fallback for other types
Matrix(this.NumCols, this.NumRows, Matrix.transposeByBlock this.NumRows this.NumCols this.Data blocksize)
static member init<'T when 'T :> Numerics.INumber<'T>