Tensorium_lib/Matrix_8hpp_source.html

#pragma once


#include "../MathUtils/MathsUtils.hpp"
#include "../SIMD/Allocator.hpp"
#include "../SIMD/CPU_id.hpp"
#include "../SIMD/SIMD.hpp"
#include "MatrixKernels/GemmKernel_bigger.hpp"
#include "Vector.hpp"

#include <cassert>
#include <cmath>
#include <immintrin.h>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <vector>


namespace tensorium {


template <typename K, bool RowMajor = false> class Matrix {

  public:

    /// Matrix dimensions: number of rows and number of columns.
    size_t            rows, cols;

    /// Flat element storage (rows * cols values at construction), addressed through index().
    aligned_vector<K> data;

    /// Value returned by detect_optimal_block_size() when the matrix was constructed.
    size_t            block_size;

    /// Layout flag; presumably true when storage is column-major — TODO confirm intended meaning.
    bool              iscolumn;


    Matrix(size_t r, size_t c)

        : rows(r),

          cols(c),

          data(r * c, K()),

          block_size(detect_optimal_block_size()) {}


    /**
     * @brief Translate a (row, column) pair into an offset into `data`.
     *
     * @param i Row index.
     * @param j Column index.
     * @return j * rows + i for the default layout, i * cols + j when RowMajor is true.
     */
    inline size_t index(size_t i, size_t j) const {
        // The layout is a template parameter, so only one branch is ever compiled in.
        if constexpr (!RowMajor) {
            const size_t column_start = j * rows;
            return column_start + i;
        } else {
            const size_t row_start = i * cols;
            return row_start + j;
        }
    }


    /// SIMD traits selected for element type K and the DefaultISA tag.
    using Simd = simd::SimdTraits<K, DefaultISA>;

    /// Register type exposed by the SIMD traits.
    using reg = typename Simd::reg;

    /// Per-object copy of Simd::width (presumably the number of K lanes per register — confirm in SIMD.hpp).
    size_t simd_width = Simd::width;

    /// Total element count implied by the current dimensions (rows times cols).
    size_t size() const {
        const size_t element_count = rows * cols;
        return element_count;
    }

    /// Mutable access to the element at row i, column j; the indices are not range-checked here.
    K &operator()(size_t i, size_t j) {
        const size_t offset = index(i, j);
        return data[offset];
    }


    /// Read-only access to the element at row i, column j; the indices are not range-checked here.
    const K &operator()(size_t i, size_t j) const {
        const size_t offset = index(i, j);
        return data[offset];
    }


    void print() const {

        for (size_t i = 0; i < rows; ++i) {

            std::cout << "[ ";

            for (size_t j = 0; j < cols; ++j)

                std::cout << operator()(i, j) << " ";

            std::cout << "]\n";

        }

    }


    void swap_rows(size_t i, size_t j) {

        assert(i < rows && j < rows);

        for (size_t k = 0; k < cols; ++k) {

            MathsUtils::_swap((*this)(i, k), (*this)(j, k));

        }

    }


    /**
     * @brief Matrix-vector product computed with plain scalar loops.
     *
     * @param v Vector whose size must equal `cols` (checked with assert).
     * @return Vector of length `rows` with result[i] = sum over j of (*this)(i, j) * v[j].
     */
    template <typename T> Vector<T> operator*(const Vector<T> &v) const {
        assert(cols == v.size() && "Matrix-Vector size mismatch");

        Vector<T> result(rows);
        for (size_t row = 0; row < rows; ++row)
            result[row] = T(0);

        // Walk one column at a time; every result entry still accumulates its
        // terms in increasing column order, exactly as a row-by-row sweep would.
        for (size_t col = 0; col < cols; ++col) {
            for (size_t row = 0; row < rows; ++row) {
                result[row] += (*this)(row, col) * v[col];
            }
        }
        return result;
    }


    /**
     * @brief In-place element-wise addition: this += m.
     *
     * @param m Matrix with the same dimensions as this one.
     * @throws std::invalid_argument if the dimensions differ.
     *
     * Works on the flat storage: two SIMD registers per iteration, then a
     * scalar loop for whatever does not fill a full pair of registers.
     */
    inline void add(const Matrix &m) {
        if (rows != m.rows || cols != m.cols)
            throw std::invalid_argument("Matrix sizes do not match");

        using Simd = simd::SimdTraits<K, DefaultISA>;
        using reg = typename Simd::reg;
        const size_t lanes = Simd::width;
        const size_t stride = 2 * lanes;

        const size_t n = size();
        if (n == 0)
            return; // nothing to do, and avoids touching an empty buffer below

        // data() instead of &data[0]: no element is indexed just to form the address.
        _mm_prefetch(reinterpret_cast<const char *>(m.data.data()), _MM_HINT_T0);

        size_t i = 0;
        for (; i + stride <= n; i += stride) {
            reg a0 = Simd::load(&data[i]);
            reg b0 = Simd::load(&m.data[i]);
            a0 = Simd::add(a0, b0);
            Simd::store(&data[i], a0);

            reg a1 = Simd::load(&data[i + lanes]);
            reg b1 = Simd::load(&m.data[i + lanes]);
            a1 = Simd::add(a1, b1);
            Simd::store(&data[i + lanes], a1);
        }

        // Scalar remainder.
        for (; i < n; ++i)
            data[i] += m.data[i];
    }


    /**
     * @brief In-place element-wise subtraction: this -= m.
     *
     * @param m Matrix with the same dimensions as this one.
     * @throws std::invalid_argument if the dimensions differ.
     *
     * Works on the flat storage: two SIMD registers per iteration, then a
     * scalar loop for the remainder.
     */
    inline void sub(const Matrix &m) {
        if (rows != m.rows || cols != m.cols)
            throw std::invalid_argument("Matrix sizes do not match");

        using Simd = simd::SimdTraits<K, DefaultISA>;
        using reg = typename Simd::reg;
        const size_t lanes = Simd::width;
        // The loop consumes exactly two registers per iteration, so the stride
        // must follow the register width. A fixed stride of 16 skipped elements
        // when the width is below 8 and ran past the end when it is above 8.
        const size_t stride = 2 * lanes;

        const size_t n = size();
        if (n == 0)
            return; // nothing to do, and avoids touching an empty buffer below

        _mm_prefetch(reinterpret_cast<const char *>(m.data.data()), _MM_HINT_T0);

        size_t i = 0;
        for (; i + stride <= n; i += stride) {
            reg a0 = Simd::load(&data[i]);
            reg b0 = Simd::load(&m.data[i]);
            a0 = Simd::sub(a0, b0);
            Simd::store(&data[i], a0);

            reg a1 = Simd::load(&data[i + lanes]);
            reg b1 = Simd::load(&m.data[i + lanes]);
            a1 = Simd::sub(a1, b1);
            Simd::store(&data[i + lanes], a1);
        }

        // Scalar remainder.
        for (; i < n; ++i)
            data[i] -= m.data[i];
    }


    /**
     * @brief In-place scalar multiplication: every element is multiplied by a.
     *
     * @param a Scale factor.
     *
     * Works on the flat storage: two SIMD registers per iteration, then a
     * scalar loop for the remainder.
     */
    inline void scl(K a) {
        using Simd = simd::SimdTraits<K, DefaultISA>;
        using reg = typename Simd::reg;
        const size_t lanes = Simd::width;
        // Two registers are processed per iteration, so the stride has to be
        // 2 * width. A fixed stride of 16 left elements unscaled when the width
        // is below 8 and overran the buffer when it is above 8.
        const size_t stride = 2 * lanes;

        const size_t n = size();
        if (n == 0)
            return; // nothing to do, and avoids touching an empty buffer below

        _mm_prefetch(reinterpret_cast<const char *>(data.data()), _MM_HINT_T0);
        reg scalar = Simd::set1(a);

        size_t i = 0;
        for (; i + stride <= n; i += stride) {
            reg v0 = Simd::load(&data[i]);
            v0 = Simd::mul(v0, scalar);
            Simd::store(&data[i], v0);

            reg v1 = Simd::load(&data[i + lanes]);
            v1 = Simd::mul(v1, scalar);
            Simd::store(&data[i + lanes], v1);
        }

        // Scalar remainder.
        for (; i < n; ++i)
            data[i] *= a;
    }


    inline void lerp(const Matrix<K> &A, const Matrix<K> &B, K alpha) {

        if (A.rows != B.rows || A.cols != B.cols || rows != A.rows || cols != A.cols)

            throw std::invalid_argument("Matrix size mismatch for lerp");


        using Simd = simd::SimdTraits<K, DefaultISA>;

        using reg = typename Simd::reg;

        const size_t simd_width = Simd::width;


        size_t n = size();

        size_t i = 0;


        reg alpha_vec = Simd::set1(alpha);

        reg one_minus_alpha_vec = Simd::set1(K(1) - alpha);


        for (; i + 2 * simd_width - 1 < n; i += 2 * simd_width) {

            reg a0 = Simd::load(&A.data[i]);

            reg b0 = Simd::load(&B.data[i]);

            reg r0 = Simd::fmadd(one_minus_alpha_vec, a0, Simd::mul(alpha_vec, b0));

            Simd::store(&data[i], r0);


            reg a1 = Simd::load(&A.data[i + simd_width]);

            reg b1 = Simd::load(&B.data[i + simd_width]);

            reg r1 = Simd::fmadd(one_minus_alpha_vec, a1, Simd::mul(alpha_vec, b1));

            Simd::store(&data[i + simd_width], r1);

        }


        for (; i < n; ++i) {

            data[i] = (K(1) - alpha) * A.data[i] + alpha * B.data[i];

        }

    }


    /**
     * @brief Matrix-matrix product this * mat, delegated to GemmKernelBigger.
     *
     * @param mat Right-hand operand; its row count must equal this->cols.
     * @return A rows x mat.cols matrix holding the product.
     * @throws std::invalid_argument if the inner dimensions differ or if any
     *         dimension does not fit in the kernel's `int` parameters.
     */
    inline Matrix _mul_mat(const Matrix<K> &mat) const {
        // The raw buffers are handed to the kernel as column-major data, which
        // only matches index() when RowMajor is false.
        static_assert(!RowMajor, "_mul_mat requires the column-major (default) layout");

        if (cols != mat.rows)
            throw std::invalid_argument("Matrix dimensions do not match for multiplication");

        // The kernel takes int sizes; refuse dimensions that would be silently
        // truncated by the casts below.
        constexpr size_t max_dim = static_cast<size_t>(std::numeric_limits<int>::max());
        if (rows > max_dim || mat.cols > max_dim || cols > max_dim)
            throw std::invalid_argument("Matrix dimensions exceed the range supported by the GEMM kernel");

        Matrix<K> result(rows, mat.cols);

        const K *A = data.data();        // left operand (this), column-major
        const K *B = mat.data.data();    // right operand, column-major
        K       *C = result.data.data(); // output, column-major

        // matmul() takes non-const pointers, hence the const_casts on the inputs.
        tensorium::GemmKernelBigger<K> kernel;
        kernel.matmul(const_cast<K *>(A), const_cast<K *>(B), C,
                      static_cast<int>(rows),     // M
                      static_cast<int>(mat.cols), // N
                      static_cast<int>(cols)      // K
        );

        return result;
    }


    /**
     * @brief Matrix-vector product using SIMD accumulation along each row.
     *
     * @param x Vector whose size must equal `cols` (checked with assert).
     * @return Vector of length `rows` holding the product.
     *
     * For every row, W elements at a time are copied into a small aligned
     * scratch array (so the load does not depend on the storage layout),
     * multiplied against the matching slice of x and accumulated in a
     * register. The register is reduced once per row and the leftover columns
     * are added with scalar arithmetic.
     */
    template <typename T> inline Vector<T> mul_vec(const Vector<T> &x) const {
        using Simd = simd::SimdTraits<T, DefaultISA>;
        using reg = typename Simd::reg;
        constexpr size_t W = Simd::width;

        assert(cols == x.size());

        Vector<T> result(rows, T(0));
        alignas(64) T row_chunk[W];

        // Columns [0, vector_end) are covered by full registers.
        const size_t vector_end = cols - cols % W;

        for (size_t row = 0; row < rows; ++row) {
            reg acc = Simd::zero();

            for (size_t base = 0; base < vector_end; base += W) {
                for (size_t lane = 0; lane < W; ++lane)
                    row_chunk[lane] = (*this)(row, base + lane);

                reg row_vec = Simd::load(row_chunk);
                reg x_vec = Simd::load(&x[base]);
                acc = Simd::fmadd(row_vec, x_vec, acc);
            }

            T total = Simd::horizontal_add(acc);

            // Scalar remainder for the trailing columns.
            for (size_t col = vector_end; col < cols; ++col)
                total += (*this)(row, col) * x[col];

            result[row] = total;
        }

        return result;
    }


    /**
     * @brief Build the transpose of this matrix.
     *
     * @return A cols x rows matrix T with T(j, i) == (*this)(i, j).
     */
    inline Matrix<K> transpose() const {
        Matrix<K> flipped(cols, rows);

        // Every destination element is written exactly once, so the sweep
        // order does not affect the result.
        for (size_t col = 0; col < cols; ++col) {
            for (size_t row = 0; row < rows; ++row) {
                flipped(col, row) = (*this)(row, col);
            }
        }

        return flipped;
    }


    /**
     * @brief Sum of the diagonal elements, returned as a 1 x 1 matrix.
     *
     * @return 1 x 1 matrix whose only element is the trace.
     * @throws std::invalid_argument if the matrix is not square.
     */
    inline Matrix<K> trace() const {
        const bool is_square = (rows == cols);
        if (!is_square) {
            throw std::invalid_argument("Matrix is not square");
        }

        K diagonal_sum = K(0);
        for (size_t d = 0; d < rows; ++d) {
            diagonal_sum += (*this)(d, d);
        }

        Matrix<K> result(1, 1);
        result(0, 0) = diagonal_sum;
        return result;
    }


    /**
     * @brief Invert a square matrix by Gauss-Jordan elimination with partial pivoting.
     *
     * @return The inverse matrix.
     * @throws std::invalid_argument if the matrix is not square.
     * @throws std::runtime_error if the largest available pivot in some column
     *         has magnitude below 1e-6 (treated as singular).
     *
     * A working copy M is reduced to the identity while the same row
     * operations are applied to Inv, which starts as the identity.
     */
    inline Matrix<K> inverse() const {
        if (rows != cols)
            throw std::invalid_argument("Matrix must be square");

        const size_t n = rows;
        Matrix<K>    M(n, n);
        Matrix<K>    Inv(n, n);

        for (size_t i = 0; i < n; ++i) {
            for (size_t j = 0; j < n; ++j) {
                M(i, j) = operator()(i, j);
                Inv(i, j) = (i == j) ? K(1) : K(0);
            }
        }

        // All loop counters are size_t so they have the same width as the
        // bound n (the previous `0u` counters were 32-bit).
        for (size_t i = 0; i < n; ++i) {
            // Partial pivoting: pick the row at or below i with the largest
            // magnitude in column i.
            size_t piv = i;
            auto   maxv = MathsUtils::_abs(M(i, i));
            for (size_t r = i + 1; r < n; ++r) {
                auto v = MathsUtils::_abs(M(r, i));
                if (v > maxv) {
                    maxv = v;
                    piv = r;
                }
            }
            if (maxv < static_cast<K>(1e-6))
                throw std::runtime_error("Matrix is singular or nearly singular.");

            if (piv != i) {
                M.swap_rows(i, piv);
                Inv.swap_rows(i, piv);
            }

            // Scale the pivot row so the pivot becomes 1.
            auto diag = M(i, i);
            auto diag_inv = K(1) / diag;
            for (size_t j = 0; j < n; ++j) {
                M(i, j) *= diag_inv;
                Inv(i, j) *= diag_inv;
            }

            // Eliminate column i from every other row. Each iteration writes
            // only row j and reads only row i (j != i), so rows are independent.
#pragma omp parallel for schedule(dynamic, UNROLL)
            for (size_t j = 0; j < n; ++j) {
                if (j != i) {
                    auto f = M(j, i);
                    for (size_t k = 0; k < n; ++k) {
                        M(j, k) -= f * M(i, k);
                        Inv(j, k) -= f * Inv(i, k);
                    }
                }
            }
        }

        return Inv;
    }


    /**
     * @brief Determinant via Gaussian elimination with partial pivoting.
     *
     * @return The determinant, or K(0) as soon as the largest available pivot
     *         in a column has magnitude below 1e-12.
     * @throws std::invalid_argument if the matrix is not square.
     *
     * The matrix is copied into a column-major working matrix, reduced to
     * upper-triangular form, and the determinant is the product of the
     * diagonal times the sign accumulated from row swaps.
     */
    inline K det() const {
        if (rows != cols)
            throw std::invalid_argument("Matrix must be square");

        const size_t n = rows;
        Matrix<K>    M(n, n);

        for (size_t j = 0; j < n; ++j)
            for (size_t i = 0; i < n; ++i)
                M(i, j) = operator()(i, j);

        K det_sign = K(1);

        for (size_t i = 0; i < n; ++i) {
            // Partial pivoting on column i.
            size_t piv = i;
            auto   maxv = MathsUtils::_abs(M(i, i));
            for (size_t r = i + 1; r < n; ++r) {
                auto v = MathsUtils::_abs(M(r, i));
                if (v > maxv) {
                    maxv = v;
                    piv = r;
                }
            }
            if (maxv < static_cast<K>(1e-12))
                return K(0);

            if (piv != i) {
                M.swap_rows(i, piv);
                det_sign = -det_sign; // each row exchange flips the sign
            }

            // Keep the elimination multipliers in column i below the diagonal;
            // that part of the matrix is never read again (only the diagonal
            // contributes to the result).
            const K pivot = M(i, i);
            for (size_t j = i + 1; j < n; ++j)
                M(j, i) /= pivot;

            // M is column-major, so the elements of one row are n apart in
            // memory. The earlier SIMD path loaded a register from &M(j, k) as
            // if the row were contiguous and therefore read and overwrote a
            // column segment instead. Updating column by column is correct and
            // keeps the inner loop on contiguous memory.
            for (size_t k = i + 1; k < n; ++k) {
                const K pivot_row_value = M(i, k);
                for (size_t j = i + 1; j < n; ++j)
                    M(j, k) -= M(j, i) * pivot_row_value;
            }
        }

        K result = det_sign;
        for (size_t i = 0; i < n; ++i)
            result *= M(i, i);

        return result;
    }


    /**
     * @brief Numerical rank via Gaussian elimination with partial pivoting.
     *
     * @param eps A column whose best pivot has magnitude <= eps is treated as
     *            having no pivot.
     * @return Number of pivots found; never exceeds min(rows, cols).
     */
    inline size_t rank(K eps = K(1e-6)) const {
        Matrix<K>    M(*this);
        const size_t m = rows;
        const size_t n = cols;
        size_t       r = 0; // next pivot row == number of pivots found so far

        // Stop once every row holds a pivot. Without the `r < m` bound a wide
        // matrix kept scanning columns with pivot_row == m, reading past the
        // last row and possibly counting more pivots than there are rows.
        for (size_t col = 0; col < n && r < m; ++col) {
            // Choose the row at or below r with the largest magnitude in this column.
            size_t pivot_row = r;
            for (size_t i = r; i < m; ++i) {
                if (MathsUtils::_abs(M(i, col)) > MathsUtils::_abs(M(pivot_row, col)))
                    pivot_row = i;
            }

            if (MathsUtils::_abs(M(pivot_row, col)) <= eps)
                continue; // no usable pivot in this column

            if (pivot_row != r)
                M.swap_rows(pivot_row, r);

            // Eliminate the column below the pivot.
            for (size_t i = r + 1; i < m; ++i) {
                auto f = M(i, col) / M(r, col);
                M(i, col) = 0;
                for (size_t j = col + 1; j < n; ++j)
                    M(i, j) -= f * M(r, j);
            }

            ++r;
        }

        return r;
    }


};


} // namespace tensorium

Allocator.hpp

aligned_vector
std::vector< K, AlignedAllocator< K, ALIGN > > aligned_vector
Type alias for a std::vector with aligned memory allocation.
Definition Allocator.hpp:111

CPU_id.hpp

detect_optimal_block_size
size_t detect_optimal_block_size()
Definition CPU_id.hpp:18

GemmKernel_bigger.hpp

GreekSymbolminus::alpha
@ alpha

MathsUtils.hpp

SIMD.hpp

Vector.hpp

MathsUtils::_swap
static void _swap(T &a, T &b)
Definition MathsUtils.hpp:26

MathsUtils::_abs
static double _abs(double a)
Definition MathsUtils.hpp:32

tensorium::GemmKernelBigger
Definition GemmKernel_bigger.hpp:16

tensorium::GemmKernelBigger::matmul
void matmul(T *A, T *B, T *C, int M, int N, int K)
Definition GemmKernel_bigger.hpp:828

tensorium::Matrix
High-performance aligned matrix class with SIMD support.
Definition Matrix.hpp:27

tensorium::Matrix::transpose
Matrix< K > transpose() const
Returns the transpose  of the matrix (column-major layout)
Definition Matrix.hpp:272

tensorium::Matrix::operator()
const K & operator()(size_t i, size_t j) const
Definition Matrix.hpp:57

tensorium::Matrix::Matrix
Matrix(size_t r, size_t c)
Construct a matrix of size r × c, initialized with zeros.
Definition Matrix.hpp:36

tensorium::Matrix::_mul_mat
Matrix _mul_mat(const Matrix< K > &mat) const
Multiply matrix by another matrix using optimized SIMD path.
Definition Matrix.hpp:212

tensorium::Matrix::rows
size_t rows
Definition Matrix.hpp:29

tensorium::Matrix::rank
size_t rank(K eps=K(1e-6)) const
Compute the numerical rank of the matrix.
Definition Matrix.hpp:428

tensorium::Matrix::sub
void sub(const Matrix &m)
In-place matrix subtraction: this -= m.
Definition Matrix.hpp:125

tensorium::Matrix::operator*
Vector< T > operator*(const Vector< T > &v) const
Multiply matrix by a vector (naïve fallback)
Definition Matrix.hpp:82

tensorium::Matrix::scl
void scl(K a)
In-place scalar multiplication: this *= a.
Definition Matrix.hpp:152

tensorium::Matrix::lerp
void lerp(const Matrix< K > &A, const Matrix< K > &B, K alpha)
Linearly interpolate between two matrices: this = (1 - α) * A + α * B.
Definition Matrix.hpp:176

tensorium::Matrix::mul_vec
Vector< T > mul_vec(const Vector< T > &x) const
Multiply matrix by a vector using SIMD.
Definition Matrix.hpp:236

tensorium::Matrix::simd_width
size_t simd_width
Definition Matrix.hpp:50

tensorium::Matrix::det
K det() const
Compute the determinant using Gaussian elimination.
Definition Matrix.hpp:365

tensorium::Matrix::iscolumn
bool iscolumn
Definition Matrix.hpp:32

tensorium::Matrix::trace
Matrix< K > trace() const
Returns the trace of a square matrix as a 1×1 matrix.
Definition Matrix.hpp:283

tensorium::Matrix::print
void print() const
Print the matrix to stdout.
Definition Matrix.hpp:60

tensorium::Matrix::block_size
size_t block_size
Definition Matrix.hpp:31

tensorium::Matrix::operator()
K & operator()(size_t i, size_t j)
Element access (mutable)
Definition Matrix.hpp:55

tensorium::Matrix::data
aligned_vector< K > data
Definition Matrix.hpp:30

tensorium::Matrix::index
size_t index(size_t i, size_t j) const
Definition Matrix.hpp:42

tensorium::Matrix::reg
typename Simd::reg reg
Definition Matrix.hpp:49

tensorium::Matrix::add
void add(const Matrix &m)
In-place matrix addition: this += m.
Definition Matrix.hpp:96

tensorium::Matrix::cols
size_t cols
Definition Matrix.hpp:29

tensorium::Matrix::inverse
Matrix< K > inverse() const
Compute the inverse of the matrix using Gauss–Jordan elimination.
Definition Matrix.hpp:303

tensorium::Matrix::swap_rows
void swap_rows(size_t i, size_t j)
Swap two rows of the matrix.
Definition Matrix.hpp:69

tensorium::Matrix::size
size_t size() const
Return the total number of elements.
Definition Matrix.hpp:52

tensorium::Vector
Aligned, SIMD-optimized mathematical vector class for scientific computing.
Definition Vector.hpp:26

tensorium::Vector::size
size_t size() const
Definition Vector.hpp:76

tensorium
Definition Derivate.hpp:24

simd::SimdTraits
Definition SIMD.hpp:177