Sync public subset from Flux

2025-10-06 20:21:40 +00:00
parent b2d00af0e1
commit 8892d58e66
15 changed files with 1825 additions and 0 deletions
--- a/assets/logo/logo_1600.png
+++ b/assets/logo/logo_1600.png
--- a/169
+++ b/169
@@ -0,0 +1,169 @@
 # Compiler and flags
 CC := g++
 CXXFLAGS := -std=c++14 -Wall -Iinclude -O3 -march=native -fopenmp
 LDFLAGS  := -fopenmp
 # Compiles .h files when there's been a change
 DEPFLAGS := -MMD -MP
 CXX ?= g++
 # Directories
 SRC_DIR := src
 INC_DIR := include
 OBJ_DIR := obj
 BIN_DIR := bin
 TEST_BIN := $(BIN_DIR)/tests
 #
 # === Executable Name ===
 TARGET := $(BIN_DIR)/abc_lab
 # === Find all .cpp files recursively in src/ ===
 SRCS := $(shell find $(SRC_DIR) -name '*.cpp')
 # === Convert src/foo/bar.cpp → obj/foo/bar.o ===
 OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(SRCS))
 # === Test sources ===
 TEST_SRCS := $(shell find test -name 'test_*.cpp')
 TEST_OBJS := $(patsubst test/%.cpp, $(OBJ_DIR)/test/%.o, $(TEST_SRCS))
 # Compiles .h files when there's been a change
 DEPS := $(OBJS:.o=.d) $(TEST_OBJS:.o=.d) $(TEST_MAIN:.o=.d)
 -include $(DEPS)
 # The single file that defines TEST_MAIN / main()
 TEST_MAIN := $(OBJ_DIR)/test/test_all.o
 # === OpenMP runtime configuration (override-able) ===
 OMP_PROC_BIND   ?= close        # close|spread|master
 OMP_PLACES      ?= cores        # cores|threads|sockets
 OMP_MAX_LEVELS  ?= 1            # 1 = no nested teams; set 2+ to allow nesting
 OMP_THREADS     ?= 16           # e.g. "16" or "8,4" for nested (outer,inner)
 OMP_DYNAMIC     ?= TRUE         # TRUE/FALSE: let runtime adjust threads
 OMP_SCHEDULE    ?= STATIC     	# STATIC recommended for matvec/matmul
 OMP_DISPLAY_ENV ?= FALSE        # TRUE to print runtime config at startup
 # Export OMP defaults so child makes or tools see them (not strictly required)
 export OMP_PROC_BIND
 export OMP_PLACES
 export OMP_MAX_LEVELS
 export OMP_THREADS
 export OMP_DYNAMIC
 export OMP_SCHEDULE
 export OMP_DISPLAY_ENV
 # What belongs to "run" vs "test"
 RUN_DEPS := $(OBJS:.o=.d)
 TEST_DEPS := $(TEST_OBJS:.o=.d) $(TEST_MAIN:.o=.d)
 RUN_ARTIFACTS  := $(TARGET) $(OBJS) $(RUN_DEPS)
 TEST_ARTIFACTS := $(TEST_BIN) $(TEST_OBJS) $(TEST_MAIN) $(TEST_DEPS)
 # === Default Target ===
 all: $(TARGET)
 # === Linking final executable ===
 $(TARGET): $(OBJS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
 # === Compiling source files to object files ===
 $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(DEPFLAGS) -c $< -o $@
 # === Run with OpenMP env set only for the run ===
 .PHONY: run
 run: clean-test $(TARGET)
 	@echo ">>> OMP_PROC_BIND=$(OMP_PROC_BIND)"
 	@echo ">>> OMP_PLACES=$(OMP_PLACES)"
 	@echo ">>> OMP_MAX_ACTIVE_LEVELS=$(OMP_MAX_LEVELS)"
 	@echo ">>> OMP_NUM_THREADS=$(OMP_THREADS)"
 	@echo ">>> OMP_DYNAMIC=$(OMP_DYNAMIC)"
 	@echo ">>> OMP_SCHEDULE=$(OMP_SCHEDULE)"
 	@echo ">>> OMP_DISPLAY_ENV=$(OMP_DISPLAY_ENV)"
 	@OMP_PROC_BIND=$(OMP_PROC_BIND) \
 	OMP_PLACES=$(OMP_PLACES) \
 	OMP_MAX_ACTIVE_LEVELS=$(OMP_MAX_LEVELS) \
 	OMP_NUM_THREADS="$(OMP_THREADS)" \
 	OMP_DYNAMIC=$(OMP_DYNAMIC) \
 	OMP_SCHEDULE=$(OMP_SCHEDULE) \
 	OMP_DISPLAY_ENV=$(OMP_DISPLAY_ENV) \
 	./$(TARGET)
 # Handy presets
 .PHONY: run-single-core
 run-single-core: ## Single-level one core (good default)
 	$(MAKE) run OMP_MAX_LEVELS=1 OMP_THREADS=1 OMP_PROC_BIND=close OMP_PLACES=cores
 # Handy presets
 .PHONY: run-multi-core
 run-multi-core: ## Single-level parallel (good default)
 	$(MAKE) run OMP_MAX_LEVELS=1 OMP_THREADS=16 OMP_PROC_BIND=close OMP_PLACES=cores
 .PHONY: run-nested-multi-core
 run-nested-multi-core: ## Two-level nested (outer,inner), adjust to your cores
 	$(MAKE) run OMP_MAX_LEVELS=2 OMP_THREADS="2,8" OMP_PROC_BIND=close OMP_PLACES=cores
 # Optional: print debug info
 .PHONY: info
 info:
 	@echo "Source files: $(SRCS)"
 	@echo "Object files: $(OBJS)"
 	@echo "CXXFLAGS: $(CXXFLAGS)"
 	@echo "LDFLAGS:   $(LDFLAGS)"
 .PHONY: test
 test: clean-run $(TEST_BIN)
 	@echo ">>> OMP_PROC_BIND=$(OMP_PROC_BIND)"
 	@echo ">>> OMP_PLACES=$(OMP_PLACES)"
 	@echo ">>> OMP_MAX_ACTIVE_LEVELS=$(OMP_MAX_LEVELS)"
 	@echo ">>> OMP_NUM_THREADS=$(OMP_THREADS)"
 	@echo ">>> OMP_DYNAMIC=$(OMP_DYNAMIC)"
 	@echo ">>> OMP_SCHEDULE=$(OMP_SCHEDULE)"
 	@echo ">>> OMP_DISPLAY_ENV=$(OMP_DISPLAY_ENV)"
 	@OMP_PROC_BIND=$(OMP_PROC_BIND) \
 	OMP_PLACES=$(OMP_PLACES) \
 	OMP_MAX_ACTIVE_LEVELS=$(OMP_MAX_LEVELS) \
 	OMP_NUM_THREADS="$(OMP_THREADS)" \
 	OMP_DYNAMIC=$(OMP_DYNAMIC) \
 	OMP_SCHEDULE=$(OMP_SCHEDULE) \
 	OMP_DISPLAY_ENV=$(OMP_DISPLAY_ENV) \
 	$(TEST_BIN)
 $(TEST_BIN): $(TEST_OBJS) $(TEST_MAIN)
 	@mkdir -p $(BIN_DIR)
 	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
 $(OBJ_DIR)/test/%.o: test/%.cpp
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(DEPFLAGS) -c $< -o $@
 # Clean up
 .PHONY: clean
 clean:
 	rm -rf $(OBJ_DIR) $(BIN_DIR)
 .PHONY: clean-run clean-test
 clean-run:
 	@echo "Cleaning run artifacts..."
 	@rm -f $(RUN_ARTIFACTS)
 clean-test:
 	@echo "Cleaning test artifacts..."
 	@rm -f $(TEST_ARTIFACTS)
--- a/test/.gitkeep
+++ b/test/.gitkeep
--- a/test/test_all.cpp
+++ b/test/test_all.cpp
@@ -0,0 +1,2 @@
 #define TEST_MAIN
 #include "test_common.h"
--- a/test/test_common.h
+++ b/test/test_common.h
@@ -0,0 +1,52 @@
 #ifndef _test_common_n_
 #define _test_common_n_
 #pragma once
 #include <iostream>
 #include <stdexcept>
 #include <string>
 #include <vector>
 struct TestFailure : public std::runtime_error {
    using std::runtime_error::runtime_error;
 };
 #define CHECK(cond, msg) do { if (!(cond)) throw TestFailure(msg); } while (0)
 #define CHECK_EQ(a,b,msg) do { if (!((a)==(b))) { throw TestFailure(std::string(msg) + " (" #a " != " #b ")"); } } while (0)
 #define TEST_CASE(name) \
    static void name(); \
    struct name##_registrar { name##_registrar(){ TestRegistry::add(#name, &name);} } name##_registrar_instance; \
    static void name()
 struct TestRegistry {
    using Fn = void(*)();
    static std::vector<std::pair<std::string, Fn>>& list() {
        static std::vector<std::pair<std::string, Fn>> v; return v;
    }
    static void add(const std::string& name, Fn fn) { list().push_back({name, fn}); }
 };
 // Default test runner main()
 #ifdef TEST_MAIN
 int main() {
    int fails = 0;
    for (auto& t : TestRegistry::list()) {
        try {
            t.second();
            //std::cout << "[PASS] " << t.first << "\n";
        } catch (const TestFailure& e) {
            std::cerr << "[FAIL] " << t.first << " -> " << e.what() << "\n";
            ++fails;
        } catch (const std::exception& e) {
            std::cerr << "[ERROR] " << t.first << " -> " << e.what() << "\n";
            ++fails;
        }
    }
    std::cout << (fails ? "Some tests failed ❌\n" : "All tests passed ✅\n");
    return fails ? 1 : 0;
 }
 #endif
 #endif // _test_common_n_
--- a/test/test_eye.cpp
+++ b/test/test_eye.cpp
@@ -0,0 +1,155 @@
 #include "test_common.h"
 #include "./utils/matrix.h"
 #include "./numerics/initializers/eye.h"
 #include <chrono>
 // Tiny helper
 template <typename T>
 static bool is_identity(const utils::Matrix<T>& M) {
    if (M.rows() != M.cols()) return false;
    for (std::uint64_t i = 0; i < M.rows(); ++i)
        for (std::uint64_t j = 0; j < M.cols(); ++j)
            if ((i == j && M(i,j) != T{1}) ||
                (i != j && M(i,j) != T{0})) return false;
    return true;
 }
 // ============ Basic correctness ============
 TEST_CASE(Generate_EYE_double) {
    utils::Matrix<double> I = numerics::eye<double>(5);
    CHECK_EQ(I.rows(), 5, "rows should be 5");
    CHECK_EQ(I.cols(), 5, "cols should be 5");
    CHECK(is_identity(I), "eye<double>(5) is not identity");
 }
 TEST_CASE(Generate_EYE_int) {
    utils::Matrix<uint64_t> I = numerics::eye<uint64_t>(3);
    CHECK_EQ(I.rows(), 3, "rows should be 3");
    CHECK_EQ(I.cols(), 3, "cols should be 3");
    CHECK(is_identity(I), "eye<int>(3) is not identity");
 }
 TEST_CASE(Inplace_EYE_resize_creates_identity) {
    utils::Md A; // empty
    numerics::inplace_eye(A, 7); // resize to 7x7 and set identity
    CHECK_EQ(A.rows(), 7, "rows should be 7");
    CHECK_EQ(A.cols(), 7, "cols should be 7");
    CHECK(is_identity(A), "inplace_eye resize did not create identity");
 }
 TEST_CASE(Inplace_EYE_inplace_on_square) {
    utils::Md A(4,4, 42.0);  // junk values
    numerics::inplace_eye(A); // N==0 → must be square, zero + diag
    CHECK(is_identity(A), "inplace_eye on square did not produce identity");
 }
 TEST_CASE(Inplace_EYE_throws_on_non_square) {
    utils::Md A(2,3, 5.0);
    bool threw = false;
    try {
        numerics::inplace_eye(A); // N==0 and non-square → throws
    } catch (const std::runtime_error&) {
        threw = true;
    }
    CHECK(threw, "inplace_eye should throw on non-square when N==0");
 }
 // ============ OpenMP variants (compiled only when -fopenmp is used) ============
 #ifdef _OPENMP
 TEST_CASE(EYE_OMP_matches_serial) {
    utils::Md I1 = numerics::eye<double>(64);
    utils::Md I2 = numerics::eye_omp<double>(64);
    for (uint64_t i = 0; i < I1.rows(); ++i){
        for (uint64_t j = 0; j < I1.cols(); ++j){
            CHECK(I1(i,j) == I2(i,j), "eye_omp != eye");
        }
    }
    utils::Md A(64,64, 3.14);
    numerics::inplace_eye_omp(A); // N==0 → must be square
    CHECK(is_identity(A), "inplace_eye_omp did not produce identity");
 }
 TEST_CASE(EYE_OMP_speed) {
    uint64_t prev = omp_get_max_threads();
    if (prev <= 1) return;
    omp_set_num_threads(16);
    uint64_t N = 16384;
    utils::Matrix<uint8_t> I1(N,N,1), I2(N,N,1);
    auto t0 = std::chrono::high_resolution_clock::now();
    numerics::inplace_eye<uint8_t>(I1);
    double t1 = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    t0 = std::chrono::high_resolution_clock::now();
    numerics::inplace_eye_omp<uint8_t>(I2);
    double t2 = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    omp_set_num_threads(prev);
    CHECK(t2 <= t1, "collapse_omp: multi-thread slower than single-thread");
 }
 TEST_CASE(EYE_OMP_nested_ok) {
    // Ensure nested regions are allowed for this test (won't error if not)
    uint64_t prev_levels = omp_get_max_active_levels();
    uint64_t prev = omp_get_max_threads();
    omp_set_max_active_levels(2);
    // Outer team size (small); inner will be large when nesting is allowed
    const int outer_threads = 1;
    // Inner team size to request during nested-enabled run
    const int inner_threads = 12;
    uint64_t N = 4096*8;
    utils::Matrix<uint8_t> I1(N, N, 7);
    utils::Matrix<uint8_t> I2(N, N, 7);
    // ---------- baseline: inner runs with a small team (no outer region) ----------
    omp_set_max_active_levels(1);                    // disable nesting for baseline
    omp_set_num_threads(outer_threads);
    for (uint64_t i = 0; i < 2; ++i){
        numerics::inplace_eye_omp<uint8_t>(I1);
    }
    // ---------- nested: outer×inner (only one outer thread launches inner) ----------
        omp_set_max_active_levels(2);                    // allow one nested level
    #pragma omp parallel num_threads(outer_threads)
    {
        #pragma omp single                              // avoid racing on I1
        {
            omp_set_num_threads(inner_threads);
            for (uint64_t i = 0; i < 2; ++i){
                numerics::inplace_eye_omp<uint8_t>(I2);
            }
        }
    }
    omp_set_max_active_levels(prev_levels);                    
    omp_set_num_threads(prev);
    CHECK(is_identity(I1), "EYE_OMP_nested did not produce identity");
    CHECK(is_identity(I2), "EYE_OMP_nested did not produce identity");
 }
 TEST_CASE(EYE_OMP_auto_is_identity) {
    auto I = numerics::eye_omp_auto<double>(32);
    CHECK(is_identity(I), "eye_omp_auto did not produce identity");
 }
 #endif
--- a/test/test_interpolation1d.cpp
+++ b/test/test_interpolation1d.cpp
@@ -0,0 +1,155 @@
 #include "test_common.h"
 #include "./utils/matrix.h"
 #include "./utils/vector.h"
 #include "./numerics/interpolation1d.h"
 // ------------ helpers ------------
 template <typename T>
 static void make_uniform_xy(utils::Vector<T>& x, utils::Vector<T>& y,
                            std::uint64_t N, T x0, T dx,
                            T a, T b) {
    x.resize(N, T(0));
    y.resize(N, T(0));
    for (std::uint64_t i=0; i<N; ++i) {
        T xi = x0 + dx * T(i);
        x[i] = xi;
        y[i] = a*xi + b;     // y = a x + b
    }
 }
 template <typename T>
 static void make_uniform_xy_fun(utils::Vector<T>& x, utils::Vector<T>& y,
                                std::uint64_t N, T x0, T dx,
                                T (*f)(T)) {
    x.resize(N, T(0));
    y.resize(N, T(0));
    for (std::uint64_t i=0; i<N; ++i) {
        T xi = x0 + dx * T(i);
        x[i] = xi;
        y[i] = f(xi);
    }
 }
 template <typename T>
 static inline bool almost_eq(T a, T b, double tol=1e-12) {
    return std::fabs(double(a)-double(b)) <= tol;
 }
 // ------------ tests ------------
 // 1) All interpolants reproduce the data points exactly (or to tiny FP error)
 TEST_CASE(Interp_Reproduces_Data_Nodes) {
    const std::uint64_t N = 25;
    utils::Vd x, y;
    // linear data: y = 0.1 x + 0.9 on x = 1..25
    make_uniform_xy<double>(x, y, N, 1.0, 1.0, 0.1, 0.9);
    numerics::interp_linear<double>       lin(x,y);
    numerics::interp_polynomial<double>   pol(x,y, 3);
    numerics::interp_cubic_spline<double> spl(x,y);
    numerics::interp_rational<double>     rat(x,y, 2);
    numerics::interp_barycentric<double>  bar(x,y, 2);
    for (std::uint64_t i=0; i<N; ++i) {
        double xi = x[i];
        double yi = y[i];
        CHECK(almost_eq(lin.interp(xi), yi), "linear fails at node");
        CHECK(almost_eq(pol.interp(xi), yi), "polynomial fails at node");
        CHECK(almost_eq(spl.interp(xi), yi), "spline fails at node");
        CHECK(almost_eq(rat.interp(xi), yi), "rational fails at node");
        CHECK(almost_eq(bar.interp(xi), yi), "barycentric fails at node");
    }
 }
 // 2) Linear dataset: all interpolants match the analytic line at off-grid points
 TEST_CASE(Interp_Linear_Dataset_OffGrid) {
    const std::uint64_t N = 100;
    utils::Vd x, y;
    // your quick test dataset: x = 1..100, y = 0.1 x + 0.9
    make_uniform_xy<double>(x, y, N, 1.0, 1.0, 0.1, 0.9);
    numerics::interp_linear<double>       lin(x,y);
    numerics::interp_polynomial<double>   pol(x,y, 3);
    numerics::interp_cubic_spline<double> spl(x,y);
    numerics::interp_rational<double>     rat(x,y, 3);
    numerics::interp_barycentric<double>  bar(x,y, 3);
    std::vector<double> qs = { 1.5, 5.5, 5.51, 50.01, 99.9 };
    for (double q : qs) {
        const double ytrue = 0.1*q + 0.9;
        CHECK(almost_eq(lin.interp(q), ytrue, 1e-9), "linear off-grid mismatch");
        CHECK(almost_eq(pol.interp(q), ytrue, 1e-9), "polynomial off-grid mismatch");
        CHECK(almost_eq(spl.interp(q), ytrue, 1e-9), "spline off-grid mismatch");
        CHECK(almost_eq(rat.interp(q), ytrue, 1e-9), "rational off-grid mismatch");
        CHECK(almost_eq(bar.interp(q), ytrue, 1e-9), "barycentric off-grid mismatch");
    }
    // endpoints should match exactly (no extrapolation)
    CHECK(almost_eq(lin.interp(x[0]),   y[0]), "linear endpoint");
    CHECK(almost_eq(lin.interp(x[N-1]), y[N-1]), "linear endpoint");
    CHECK(almost_eq(spl.interp(x[0]),   y[0]), "spline endpoint");
    CHECK(almost_eq(spl.interp(x[N-1]), y[N-1]), "spline endpoint");
 }
 // 3) Quadratic dataset: degree-2 polynomial interpolation should be exact
 static double f_quad(double t) { return t*t - 3.0*t + 2.0; } // (t-1)(t-2)
 TEST_CASE(Interp_Polynomial_Deg2_Exact_On_Quadratic) {
    const std::uint64_t N = 21;
    utils::Vd x, y;
    make_uniform_xy_fun<double>(x, y, N, -2.0, 0.5, &f_quad);
    numerics::interp_polynomial<double> pol(x,y, 3);
    std::vector<double> qs = { -1.75, -0.1, 0.3, 1.4, 3.9, 7.25 };
    for (double q : qs) {
        // only test inside the data domain to avoid extrap behavior
        if (q >= x[0] && q <= x[N-1]) {
            const double ytrue = f_quad(q);
            //std::cout << pol.interp(q) << ", " << ytrue << ", "<< q <<std::endl;
            CHECK(almost_eq(pol.interp(q), ytrue, 1e-10), "degree-2 polynomial should be exact on quadratic");
        }
    }
 }
 // 4) Cross-method consistency: methods agree closely on smooth data (cosine wave)
 static double f_cos(double t) { return std::cos(0.1*t); }
 TEST_CASE(Interp_Cross_Method_Consistency) {
    const std::uint64_t N = 60;
    utils::Vd x, y;
    make_uniform_xy_fun<double>(x, y, N, 0.0, 0.5, &f_cos);
    numerics::interp_linear<double>       lin(x,y);
    numerics::interp_polynomial<double>   pol(x,y, 3);  // small local degree
    numerics::interp_cubic_spline<double> spl(x,y);
    numerics::interp_rational<double>     rat(x,y, 3);
    numerics::interp_barycentric<double>  bar(x,y, 3);
    // sample a handful of interior points and require all methods to be mutually close
    std::vector<double> qs = { 1.25, 7.75, 12.1, 18.6, 22.75 };
    for (double q : qs) {
        // skip if q is outside just in case
        if (q < x[0] || q > x[N-1]) continue;
        double yl = lin.interp(q);
        double yp = pol.interp(q);
        double ys = spl.interp(q);
        double yr = rat.interp(q);
        double yb = bar.interp(q);
        //std::cout << "lin: " << yl << std::endl;
        //std::cout << "pol: " << yp << std::endl;
        //std::cout << "spl: " << ys << std::endl;
        //std::cout << "rat: " << yr << std::endl;
        //std::cout << "bar: " << yb << std::endl;
        // spline is usually the smoothest; use it as the anchor
        CHECK(almost_eq(yl, ys, 5e-4), "linear vs spline");
        CHECK(almost_eq(yp, ys, 5e-4), "polynomial vs spline");
        CHECK(almost_eq(yr, ys, 5e-4), "rational vs spline");
        CHECK(almost_eq(yb, ys, 5e-4), "barycentric vs spline");
    }
 }
--- a/test/test_inverse.cpp
+++ b/test/test_inverse.cpp
@@ -0,0 +1,141 @@
 #include "test_common.h"
 #include "./utils/matrix.h"
 #include "./utils/vector.h"
 #include "./numerics/inverse.h"
 #include "./numerics/matmul.h"
 // ---------- helpers ----------
 template <typename T>
 static utils::Matrix<T> identity(std::uint64_t n) {
    utils::Matrix<T> I(n,n,T(0));
    for (std::uint64_t i=0;i<n;++i) I(i,i) = T(1);
    return I;
 }
 template <typename T>
 static bool mats_equal_tol(const utils::Matrix<T>& X,
                           const utils::Matrix<T>& Y,
                           double tol = 1e-12) {
    if (X.rows()!=Y.rows() || X.cols()!=Y.cols()) return false;
    for (std::uint64_t i=0;i<X.rows();++i)
        for (std::uint64_t j=0;j<X.cols();++j)
            if (std::fabs(double(X(i,j) - Y(i,j))) > tol) return false;
    return true;
 }
 // A small well-conditioned SPD 3x3
 static utils::Matrix<double> make_A3() {
    utils::Matrix<double> A(3,3,0.0);
    // [ 4  3  0
    //   3  4 -1
    //   0 -1  4 ]
    A(0,0)=4; A(0,1)=3;  A(0,2)=0;
    A(1,0)=3; A(1,1)=4;  A(1,2)=-1;
    A(2,0)=0; A(2,1)=-1; A(2,2)=4;
    return A;
 }
 // A random-ish SPD 5x5 (constructed deterministically)
 static utils::Matrix<double> make_A5_spd() {
    utils::Matrix<double> R(5,5,0.0);
    // Fill R with a simple pattern
    double v=1.0;
    for (std::uint64_t i=0;i<5;++i)
        for (std::uint64_t j=0;j<5;++j, v+=0.37)
            R(i,j) = std::fmod(v, 3.0) - 1.0;   // values in [-1,2)
    // A = R^T R + 5 I  → SPD and well-conditioned
    utils::Matrix<double> Rt(5,5,0.0);
    for (std::uint64_t i=0;i<5;++i)
        for (std::uint64_t j=0;j<5;++j)
            Rt(i,j) = R(j,i);
    auto RtR = numerics::matmul(Rt, R);
    for (std::uint64_t i=0;i<5;++i) RtR(i,i) += 5.0;
    return RtR;
 }
 // ---------- tests ----------
 TEST_CASE(Inverse_GJ_3x3) {
    auto A = make_A3();
    auto A_copy = A;
    auto Inv = numerics::inverse(A, "Gauss-Jordan");  // non-inplace
    auto I = identity<double>(3);
    auto AInv  = numerics::matmul(A, Inv);
    auto InvA  = numerics::matmul(Inv, A);
    CHECK(mats_equal_tol(AInv, I, 1e-11), "A*Inv != I (Gauss-Jordan)");
    CHECK(mats_equal_tol(InvA, I, 1e-11), "Inv*A != I (Gauss-Jordan)");
    // A should be unchanged by numerics::inverse (it copies internally)
    CHECK(mats_equal_tol(A, A_copy, 1e-15), "inverse() modified input A");
 }
 TEST_CASE(Inverse_LU_3x3) {
    auto A = make_A3();
    auto Inv = numerics::inverse(A, "LU");
    auto I = identity<double>(3);
    auto AInv  = numerics::matmul(A, Inv);
    auto InvA  = numerics::matmul(Inv, A);
    CHECK(mats_equal_tol(AInv, I, 1e-11), "A*Inv != I (LU)");
    CHECK(mats_equal_tol(InvA, I, 1e-11), "Inv*A != I (LU)");
 }
 TEST_CASE(Inplace_Inverse_Both_Methods_Agree_5x5) {
    auto A = make_A5_spd();
    auto GJ = A; numerics::inplace_inverse(GJ, "Gauss-Jordan");
    auto LU = A; numerics::inplace_inverse(LU, "LU");
    // Both should be valid inverses
    auto I = identity<double>(5);
    CHECK(mats_equal_tol(numerics::matmul(A, GJ), I, 1e-10), "A*GJ != I");
    CHECK(mats_equal_tol(numerics::matmul(GJ, A), I, 1e-10), "GJ*A != I");
    CHECK(mats_equal_tol(numerics::matmul(A, LU), I, 1e-10), "A*LU != I");
    CHECK(mats_equal_tol(numerics::matmul(LU, A), I, 1e-10), "LU*A != I");
    // And they should be very close to each other
    CHECK(mats_equal_tol(GJ, LU, 1e-10), "Gauss-Jordan inverse != LU inverse");
 }
 TEST_CASE(Inverse_NonSquare_Throws) {
    utils::Matrix<double> A(2,3,1.0);
    bool threw=false;
    try { auto B = numerics::inverse(A, "Gauss-Jordan"); (void)B; } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inverse should throw on non-square (Gauss-Jordan)");
    threw=false;
    try { numerics::inplace_inverse(A, "LU"); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inplace_inverse should throw on non-square (LU)");
 }
 TEST_CASE(Inverse_Singular_Throws) {
    utils::Matrix<double> A(3,3,0.0);
    // Two identical rows → singular
    A(0,0)=1; A(0,1)=2; A(0,2)=3;
    A(1,0)=1; A(1,1)=2; A(1,2)=3;
    A(2,0)=0; A(2,1)=1; A(2,2)=4;
    bool threw=false;
    try { auto B = numerics::inverse(A, "Gauss-Jordan"); (void)B; } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inverse(GJ) should throw on singular");
    threw=false;
    try { auto B = numerics::inverse(A, "LU"); (void)B; } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inverse(LU) should throw on singular");
 }
 TEST_CASE(Inverse_Invalid_Method_Throws) {
    auto A = make_A3();
    bool threw=false;
    try { auto B = numerics::inverse(A, "NotAThing"); (void)B; } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inverse should throw on unknown method");
 }
--- a/test/test_lu.cpp
+++ b/test/test_lu.cpp
@@ -0,0 +1,169 @@
 #include "test_common.h"
 #include "./utils/matrix.h"           
 #include "./utils/vector.h"           
 #include "./numerics/matmul.h"       
 #include "./numerics/matvec.h" 
 #include "./decomp/lu.h"
 //#include <chrono>
 // ---------- helpers ----------
 template <typename T>
 static bool mats_equal_tol(const utils::Matrix<T>& X,
                           const utils::Matrix<T>& Y,
                           double tol = 1e-12) {
    if (X.rows()!=Y.rows() || X.cols()!=Y.cols()) return false;
    for (std::uint64_t i=0;i<X.rows();++i)
        for (std::uint64_t j=0;j<X.cols();++j)
            if (std::fabs(double(X(i,j) - Y(i,j))) > tol) return false;
    return true;
 }
 template <typename T>
 static utils::Matrix<T> identity(std::uint64_t n) {
    utils::Matrix<T> I(n,n,T(0));
    for (std::uint64_t i=0;i<n;++i) I(i,i) = T(1);
    return I;
 }
 template <typename T>
 static void split_LU(const utils::Matrix<T>& lu,
                     utils::Matrix<T>& L,
                     utils::Matrix<T>& U) {
    const std::uint64_t n = lu.rows();
    L.resize(n,n,T(0));
    U.resize(n,n,T(0));
    for (std::uint64_t i=0;i<n;++i) {
        for (std::uint64_t j=0;j<n;++j) {
            if (i>j)      L(i,j) = lu(i,j);
            else if (i==j){ L(i,i) = T(1); U(i,i) = lu(i,i); }
            else          U(i,j) = lu(i,j);
        }
    }
 }
 template <typename T>
 static utils::Matrix<T> permutation_from_indx(const std::vector<std::uint64_t>& indx) {
    const std::uint64_t n = indx.size();
    auto P = identity<T>(n);
    // Apply the same sequence of row swaps that was applied during factorization
    for (std::uint64_t k=0;k<n;++k) {
        const std::uint64_t imax = indx[k];
        if (imax != k) P.swap_rows(k, imax);
    }
    return P;
 }
 // A well-conditioned 3x3 (symmetric positive definite)
 static utils::Matrix<double> make_A_spd() {
    utils::Matrix<double> A(3,3,0.0);
    // [ 4  3  0
    //   3  4 -1
    //   0 -1  4 ]
    A(0,0)=4; A(0,1)=3;  A(0,2)=0;
    A(1,0)=3; A(1,1)=4;  A(1,2)=-1;
    A(2,0)=0; A(2,1)=-1; A(2,2)=4;
    return A;
 }
 TEST_CASE(LU_PA_equals_LU) {
    auto A = make_A_spd();
    decomp::LUdcmpd lu(A);
    utils::Matrix<double> L,U;
    split_LU(lu.lu, L, U);
    auto P = permutation_from_indx<double>(lu.indx);
    auto PA  = numerics::matmul(P, A);
    auto LU  = numerics::matmul(L, U);
    CHECK(mats_equal_tol(PA, LU, 1e-12), "PA should equal LU");
 }
 TEST_CASE(LU_Solve_Vector) {
    auto A = make_A_spd();
    decomp::LUdcmpd lu(A);
    utils::Vd b(3,0.0);
    b[0]=1.0; b[1]=2.0; b[2]=3.0;
    auto x = lu.solve(b);
    auto Ax = numerics::matvec(A, x);
    CHECK(b.nearly_equal_vec(Ax, 1e-12), "A*x should equal b");
 }
 TEST_CASE(LU_Solve_Matrix_MultiRHS) {
    auto A = make_A_spd();
    decomp::LUdcmpd lu(A);
    utils::Matrix<double> B(3,2,0.0);
    // two RHS columns
    B(0,0)=1; B(1,0)=2; B(2,0)=3;
    B(0,1)=4; B(1,1)=5; B(2,1)=6;
    auto X = lu.solve(B);                 // 3x2
    // Check A*X == B
    auto AX = numerics::matmul(A, X);
    CHECK(mats_equal_tol(AX, B, 1e-12), "A*X should equal B");
    // And that column-wise solve agrees
    utils::Vd b0(3,0.0), b1(3,0.0);
    for (int i=0;i<3;++i){ b0[i]=B(i,0); b1[i]=B(i,1); }
    auto x0 = lu.solve(b0);
    auto x1 = lu.solve(b1);
    CHECK(std::fabs(double(X(0,0)-x0[0]))<1e-12 &&
          std::fabs(double(X(1,0)-x0[1]))<1e-12 &&
          std::fabs(double(X(2,0)-x0[2]))<1e-12, "column 0 mismatch");
    CHECK(std::fabs(double(X(0,1)-x1[0]))<1e-12 &&
          std::fabs(double(X(1,1)-x1[1]))<1e-12 &&
          std::fabs(double(X(2,1)-x1[2]))<1e-12, "column 1 mismatch");
 }
 TEST_CASE(LU_Determinant) {
    auto A = make_A_spd();
    decomp::LUdcmpd lu(A);
    // For this A, det = 24
    double d = lu.det();
    CHECK(std::fabs(d - 24.0) < 1e-12, "determinant incorrect");
 }
 TEST_CASE(LU_Inverse_via_SolveI) {
    auto A = make_A_spd();
    decomp::LUdcmpd lu(A);
    // Build identity and solve A * X = I
    auto I  = identity<double>(3);
    auto Inv = lu.solve(I);
    // Check A*Inv == I (and Inv*A == I for good measure)
    auto AInv = numerics::matmul(A, Inv);
    auto InvA = numerics::matmul(Inv, A);
    CHECK(mats_equal_tol(AInv, I, 1e-11), "A*Inv should be I");
    CHECK(mats_equal_tol(InvA, I, 1e-11), "Inv*A should be I");
 }
 TEST_CASE(LU_NonSquare_Throws) {
    utils::Matrix<double> A(2,3,1.0);
    bool threw=false;
    try { decomp::LUdcmpd lu(A); } catch (const std::runtime_error&) { threw = true; }
    CHECK(threw, "LU should throw on non-square");
 }
 TEST_CASE(LU_Singular_Throws) {
    utils::Matrix<double> A(3,3,0.0);
    // Make two identical rows
    A(0,0)=1; A(0,1)=2; A(0,2)=3;
    A(1,0)=1; A(1,1)=2; A(1,2)=3;
    A(2,0)=0; A(2,1)=1; A(2,2)=4;
    bool threw=false;
    try { decomp::LUdcmpd lu(A); } catch (const std::runtime_error&) { threw = true; }
    CHECK(threw, "LU should throw on singular matrix");
 }
--- a/test/test_matequal.cpp
+++ b/test/test_matequal.cpp
@@ -0,0 +1,108 @@
 #include "test_common.h"
 #include "./numerics/matequal.h" 
 using utils::Vf; using utils::Vd; using utils::Vi;
 using utils::Mf; using utils::Md; using utils::Mi;
 // ---------- helpers ----------
 template <typename T>
 static void fill_seq(utils::Matrix<T>& M, T start = T{0}, T step = T{1}) {
    std::uint64_t k = 0;
    for (std::uint64_t i = 0; i < M.rows(); ++i)
        for (std::uint64_t j = 0; j < M.cols(); ++j, ++k)
            M(i,j) = start + step * static_cast<T>(k);
 }
 // ---------- tests ----------
 TEST_CASE(matequal_shape_mismatch) {
    utils::Mi A(3,3,0), B(3,4,0);
    CHECK(!numerics::matequal(A,B), "shape mismatch should be false (serial)");
 #ifdef _OPENMP
    CHECK(!numerics::matequal_omp(A,B), "shape mismatch should be false (omp)");
 #endif
    CHECK(!numerics::matequal_auto(A,B), "shape mismatch should be false (auto)");
 }
 TEST_CASE(matequal_int_true_false) {
    utils::Mi A(4,5,0), B(4,5,0);
    fill_seq(A, int64_t(0), int64_t(1));
    fill_seq(B, int64_t(0), int64_t(1));
    CHECK(numerics::matequal(A,B), "ints equal (serial)");
 #ifdef _OPENMP
    CHECK(numerics::matequal_omp(A,B), "ints equal (omp)");
 #endif
    // flip one element
    B(2,3) += 1;
    CHECK(!numerics::matequal(A,B), "ints differ (serial)");
 #ifdef _OPENMP
    CHECK(!numerics::matequal_omp(A,B), "ints differ (omp)");  // will FAIL if your omp branch uses '!='
 #endif
 }
 TEST_CASE(matequal_double_tolerance) {
    utils::Md A(3,3,0.0), B(3,3,0.0);
    fill_seq(A, double(1.0), double(0.125));
    fill_seq(B, double(1.0), double(0.125));
    // tiny perturbation within default tol
    B(1,1) += 1e-12;
    CHECK(numerics::matequal(A,B), "double within tol (serial)");
 #ifdef _OPENMP
    CHECK(numerics::matequal_omp(A,B), "double within tol (omp)");
 #endif
    // larger perturbation exceeds tol
    B(0,2) += 1e-6;
    CHECK(!numerics::matequal(A,B, 1e-9), "double exceeds tol (serial)");
 #ifdef _OPENMP
    CHECK(!numerics::matequal_omp(A,B, 1e-9), "double exceeds tol (omp)");
 #endif
 }
 TEST_CASE(matequal_auto_agrees) {
    // Choose size so auto likely takes the OMP path when available,
    // but this test only checks correctness, not which path was taken.
    utils::Md A(256,256,0.0), B(256,256,0.0);
    fill_seq(A, double(0.0), double(0.01));
    fill_seq(B, double(0.0), double(0.01));
    CHECK(numerics::matequal_auto(A,B), "auto equal");
    B(5,7) += 1e-3;
    CHECK(!numerics::matequal_auto(A,B, 1e-9), "auto detects mismatch");
 }
 #ifdef _OPENMP
 TEST_CASE(mateequal_omp_nested_callsite) {
    // Verify correctness when called inside an outer parallel region.
    utils::Mi A(128,128,0), B(128,128,0);
    fill_seq(A, int64_t(0), int64_t(1));
    fill_seq(B, int64_t(0), int64_t(1));
    // allow one nested level; inner region inside mateequal_omp may spawn a team
    int prev_levels = omp_get_max_active_levels();
    omp_set_max_active_levels(2);
    bool ok_equal = false, ok_diff = false;
    #pragma omp parallel num_threads(2) shared(ok_equal, ok_diff)
    {
        #pragma omp single
        {
            ok_equal = numerics::matequal_omp(A,B);
            B(10,10) += 1; // introduce a mismatch
            ok_diff = !numerics::matequal_omp(A,B);
        }
    }
    omp_set_max_active_levels(prev_levels);
    CHECK(ok_equal, "nested equal should be true");
    CHECK(ok_diff, "nested mismatch should be false");
 }
 #endif
--- a/test/test_matmul.cpp
+++ b/test/test_matmul.cpp
@@ -0,0 +1,127 @@
 #include "test_common.h"
 #include "./utils/utils.h"
 #include "./numerics/matmul.h"
 #include <chrono>
 // ---------- helpers ----------
 template <typename T>
 static bool mats_equal(const utils::Matrix<T>& X, const utils::Matrix<T>& Y, double tol = 0.0) {
    if (X.rows()!=Y.rows() || X.cols()!=Y.cols()) return false;
    if (std::is_floating_point<T>::value) {
        for (std::uint64_t i=0;i<X.rows();++i)
            for (std::uint64_t j=0;j<X.cols();++j)
                if (std::fabs(double(X(i,j)) - double(Y(i,j))) > tol) return false;
    } else {
        for (std::uint64_t i=0;i<X.rows();++i)
            for (std::uint64_t j=0;j<X.cols();++j)
                if (X(i,j) != Y(i,j)) return false;
    }
    return true;
 }
 template <typename T>
 static void fill_seq(utils::Matrix<T>& M, T start = T(0), T step = T(1)) {
    std::uint64_t k = 0;
    for (std::uint64_t i=0;i<M.rows();++i)
        for (std::uint64_t j=0;j<M.cols();++j,++k)
            M(i,j) = start + step * static_cast<T>(k);
 }
 // ---------- tests ----------
 // Small known example: (3x2) · (2x3)
 TEST_CASE(Matmul_Small_Known) {
    utils::Mi A(3,2,0), B(2,3,0);
    // A = [1 2; 3 4; 5 6]
    A(0,0)=1; A(0,1)=2;
    A(1,0)=3; A(1,1)=4;
    A(2,0)=5; A(2,1)=6;
    // B = [7 8 9; 10 11 12]
    B(0,0)=7;  B(0,1)=8;  B(0,2)=9;
    B(1,0)=10; B(1,1)=11; B(1,2)=12;
    auto C = numerics::matmul(A,B);
    CHECK(C.rows()==3 && C.cols()==3, "shape 3x3 wrong");
    // Expected C:
    // [27 30 33]
    // [61 68 75]
    // [95 106 117]
    CHECK(C(0,0)==27 && C(0,1)==30 && C(0,2)==33, "row 0 wrong");
    CHECK(C(1,0)==61 && C(1,1)==68 && C(1,2)==75, "row 1 wrong");
    CHECK(C(2,0)==95 && C(2,1)==106 && C(2,2)==117, "row 2 wrong");
 }
 TEST_CASE(Matmul_DimMismatch_Throws) {
    utils::Md A(2,3,1.0), B(4,2,2.0); // A.cols()!=B.rows()
    bool threw=false;
    try { (void)numerics::matmul(A,B); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "matmul should throw on dim mismatch");
 }
 // Compare all variants vs serial on a moderate size
 TEST_CASE(Matmul_Variants_Equal_Int) {
    const std::uint64_t m=32, n=24, p=16;
    utils::Mi A(m,n,0), B(n,p,0);
    // deterministic fill (no randomness)
    fill_seq(A, int64_t(1), int64_t(1));
    fill_seq(B, int64_t(2), int64_t(3));
    auto C_ref = numerics::matmul(A,B);
    auto C_rows     = numerics::matmul_rows_omp(A,B);
    auto C_collapse = numerics::matmul_collapse_omp(A,B);
    auto C_auto     = numerics::matmul_auto(A,B);
    CHECK(mats_equal(C_rows,     C_ref), "rows_omp != serial");
    CHECK(mats_equal(C_collapse, C_ref), "collapse_omp != serial");
    CHECK(mats_equal(C_auto,     C_ref), "auto != serial");
 }
 TEST_CASE(Matmul_Variants_Equal_Double) {
    const std::uint64_t m=33, n=17, p=19;
    utils::Md A(m,n,0.0), B(n,p,0.0);
    fill_seq(A, 0.1, 0.01);
    fill_seq(B, 1.0, 0.02);
    auto C_ref = numerics::matmul(A,B);
    auto C_rows     = numerics::matmul_rows_omp(A,B);
    auto C_collapse = numerics::matmul_collapse_omp(A,B);
    auto C_auto     = numerics::matmul_auto(A,B);
    CHECK(mats_equal(C_rows,     C_ref, 1e-9), "rows_omp != serial (double)");
    CHECK(mats_equal(C_collapse, C_ref, 1e-9), "collapse_omp != serial (double)");
    CHECK(mats_equal(C_auto,     C_ref, 1e-9), "auto != serial (double)");
 }
 // Nested callsite sanity: call OMP variant from within an outer region
 #ifdef _OPENMP
 TEST_CASE(Matmul_OMP_Nested_Callsite) {
    const std::uint64_t m=48, n=24, p=32;
    utils::Mi A(m,n,0), B(n,p,0);
    fill_seq(A, int64_t(1), int64_t(2));
    fill_seq(B, int64_t(3), int64_t(1));
    auto C_ref = numerics::matmul(A,B);
    int prev_levels = omp_get_max_active_levels();
    omp_set_max_active_levels(2);
    utils::Mi C_nested;
    #pragma omp parallel num_threads(2)
    {
        #pragma omp single
        {
            // either variant is fine; collapse(2) has more parallelism
            C_nested = numerics::matmul_collapse_omp(A,B);
        }
    }
    omp_set_max_active_levels(prev_levels);
    CHECK(mats_equal(C_nested, C_ref), "nested collapse_omp result mismatch");
 }
 #endif
--- a/test/test_matrix.cpp
+++ b/test/test_matrix.cpp
@@ -0,0 +1,164 @@
 #include "test_common.h"
 #include "./utils/matrix.h"
 using utils::Vf; using utils::Vd; using utils::Vi;
 using utils::Mf; using utils::Md; using utils::Mi;
 // tiny helper
 template <typename T>
 static bool mat_is_filled(const utils::Matrix<T>& M, T v) {
    for (std::uint64_t i = 0; i < M.rows(); ++i)
        for (std::uint64_t j = 0; j < M.cols(); ++j)
            if (M(i,j) != v) return false;
    return true;
 }
 // ------------------- basic construction -------------------
 TEST_CASE(Matrix_Default_Construct) {
    Md M;
    CHECK_EQ(M.rows(), 0, "default rows should be 0");
    CHECK_EQ(M.cols(), 0, "default cols should be 0");
 }
 TEST_CASE(Matrix_Filled_Construct) {
    Md M(3, 4, 2.5);
    CHECK_EQ(M.rows(), 3, "rows");
    CHECK_EQ(M.cols(), 4, "cols");
    CHECK(mat_is_filled(M, 2.5), "all elements should be 2.5");
 }
 // ------------------- element access / write -------------------
 TEST_CASE(Matrix_Set_Get) {
    Mi M(2, 3, 0);
    M(0,0) = 42;
    M(1,2) = -7;
    CHECK_EQ(M(0,0), 42, "set/get (0,0)");
    CHECK_EQ(M(1,2), -7, "set/get (1,2)");
 }
 // ------------------- resize semantics -------------------
 TEST_CASE(Matrix_Resize_Grow) {
    Mf M(2, 2, 1.0f);
    M.resize(3, 4, 9.0f); // grow; newly appended elements get the fill value
    CHECK_EQ(M.rows(), 3, "rows after resize");
    CHECK_EQ(M.cols(), 4, "cols after resize");
    CHECK(M(2,3) == 9.0f, "last element should be the fill value after grow");
 }
 TEST_CASE(Matrix_Resize_Shrink) {
    Mi M(4, 4, 5);
    M(0,0) = 11;
    M.resize(2, 2, 999); // shrink; size reduces
    CHECK_EQ(M.rows(), 2, "rows after shrink");
    CHECK_EQ(M.cols(), 2, "cols after shrink");
    // element mapping after shrink is implementation dependent; just check bounds usable
    M(1,1) = 3;
    CHECK_EQ(M(1,1), 3, "write after shrink works");
 }
 // ------------------- row helpers -------------------
 TEST_CASE(Matrix_Get_Row) {
    Mi M(3,4,0);
    // set row 1 to [10,20,30,40]
    for (std::uint64_t j=0;j<4;++j) M(1,j) = (j+1)*10;
    auto r = M.get_row(1);
    CHECK_EQ(r.size(), 4, "row size");
    CHECK(r[0]==10 && r[1]==20 && r[2]==30 && r[3]==40, "row contents");
 }
 TEST_CASE(Matrix_Set_Row) {
    Mi M(2,3,0);
    utils::Vector<int64_t> v(3, 0);
    v[0]=7; v[1]=8; v[2]=9;
    M.set_row(0, v);
    CHECK(M(0,0)==7 && M(0,1)==8 && M(0,2)==9, "set_row contents");
 }
 TEST_CASE(Matrix_Row_OutOfRange_Throws) {
    Mi M(2,2,0);
    bool threw=false;
    try { (void)M.get_row(2); } catch(const std::out_of_range&) { threw=true; }
    CHECK(threw, "get_row out-of-range should throw");
    threw=false;
    try {
        utils::Vector<int64_t> v(3,1); // wrong size
        M.set_row(1, v);
    } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "set_row size mismatch should throw");
 }
 // ------------------- col helpers -------------------
 TEST_CASE(Matrix_Get_Col) {
    Mi M(3,2,0);
    M(0,1)=5; M(1,1)=6; M(2,1)=7;
    auto c = M.get_col(1);
    CHECK_EQ(c.size(), 3, "col size");
    CHECK(c[0]==5 && c[1]==6 && c[2]==7, "col contents");
 }
 TEST_CASE(Matrix_Set_Col) {
    Mi M(3,2,0);
    utils::Vector<int64_t> v(3, 0);
    v[0]=1; v[1]=4; v[2]=9;
    M.set_col(0, v);
    CHECK(M(0,0)==1 && M(1,0)==4 && M(2,0)==9, "set_col contents");
 }
 TEST_CASE(Matrix_Col_OutOfRange_Throws) {
    Mi M(2,2,0);
    bool threw=false;
    try { (void)M.get_col(2); } catch(const std::out_of_range&) { threw=true; }
    CHECK(threw, "get_col out-of-range should throw");
    threw=false;
    try {
        utils::Vector<int64_t> v(1,1); // wrong size
        M.set_col(1, v);
    } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "set_col size mismatch should throw");
 }
 // ------------------- swap rows/cols -------------------
 TEST_CASE(Matrix_Swap_Rows) {
    Mi M(2,3,0);
    // row0: 1 2 3, row1: 4 5 6
    for (std::uint64_t j=0;j<3;++j){ M(0,j)=j+1; M(1,j)=j+4; }
    M.swap_rows(0,1);
    CHECK(M(0,0)==4 && M(0,1)==5 && M(0,2)==6, "row0 after swap");
    CHECK(M(1,0)==1 && M(1,1)==2 && M(1,2)==3, "row1 after swap");
 }
 TEST_CASE(Matrix_Swap_Cols) {
    Mi M(3,3,0);
    // col0: 9 8 7, col2: 1 2 3
    M(0,0)=9; M(1,0)=8; M(2,0)=7;
    M(0,2)=1; M(1,2)=2; M(2,2)=3;
    M.swap_cols(0,2);
    CHECK(M(0,0)==1 && M(1,0)==2 && M(2,0)==3, "col0 after swap");
    CHECK(M(0,2)==9 && M(1,2)==8 && M(2,2)==7, "col2 after swap");
 }
 TEST_CASE(Matrix_Swap_OutOfRange_Throws) {
    Mi M(2,2,0);
    bool threw=false;
    try { M.swap_rows(0,2); } catch(const std::out_of_range&) { threw=true; }
    CHECK(threw, "swap_rows out-of-range should throw");
    threw=false;
    try { M.swap_cols(0,3); } catch(const std::out_of_range&) { threw=true; }
    CHECK(threw, "swap_cols out-of-range should throw");
 }
 // ------------------- stream output (basic sanity) -------------------
 TEST_CASE(Matrix_Stream_ToString) {
    Mf M(2,2,0.0f);
    M(0,0)=1.0f; M(0,1)=2.0f; M(1,0)=3.0f; M(1,1)=4.0f;
    std::ostringstream os;
    os << M;
    auto s = os.str();
    CHECK(s.find("[[") != std::string::npos, "starts with [[");
    CHECK(s.find("1.000") != std::string::npos, "contains formatted 1.000");
    CHECK(s.find("4.000") != std::string::npos, "contains formatted 4.000");
 }
--- a/test/test_matvec.cpp
+++ b/test/test_matvec.cpp
@@ -0,0 +1,236 @@
 #include "test_common.h"
 #include "./numerics/matvec.h"    // numerics::matvec / inplace_transpose
 #include <chrono>
 using utils::Vi; using utils::Vf; using utils::Vd;
 using utils::Mi; using utils::Mf; using utils::Md;
 // ------------------------------------------------------------
 // matvec: y = A * x
 // ------------------------------------------------------------
 TEST_CASE(Matvec_Serial_Simple) {
    // A = [[1,2,3],
    //      [4,5,6]]
    Md A(2,3,0.0);
    A(0,0)=1; A(0,1)=2; A(0,2)=3;
    A(1,0)=4; A(1,1)=5; A(1,2)=6;
    Vd x(3,0.0); x[0]=7; x[1]=8; x[2]=9;
    auto y = numerics::matvec<double>(A,x); // [ 1*7+2*8+3*9 , 4*7+5*8+6*9 ] = [50, 122]
    CHECK(y.size()==2, "matvec size wrong");
    CHECK(y[0]==50.0 && y[1]==122.0, "matvec values wrong");
 }
 TEST_CASE(Matvec_OMP_Equals_Serial) {
    Md A(3,3,0.0);
    // A = I * 2
    for (uint64_t i=0;i<3;++i) A(i,i)=2.0;
    Vd x(3,0.0); x[0]=1; x[1]=2; x[2]=3;
    auto ys = numerics::matvec<double>(A,x);
    auto yp = numerics::matvec_omp<double>(A,x);
    CHECK((ys.nearly_equal_vec(yp)), "matvec_omp != matvec");
 }
 TEST_CASE(Matvec_Auto_Equals_Serial) {
    Md A(2,2,0.0); A(0,0)=2; A(0,1)=1; A(1,0)=0.5; A(1,1)=3;
    Vd x(2,0.0); x[0]=4; x[1]=5;
    auto ys = numerics::matvec<double>(A,x);
    auto ya = numerics::matvec_auto<double>(A,x);
    CHECK((ys.nearly_equal_vec(ya)), "matvec_auto != serial");
 }
 TEST_CASE(Matvec_DimensionMismatch_Throws) {
    Md A(2,3,0.0);
    Vd x(4,0.0);
    bool threw=false;
    try { auto _ = numerics::matvec<double>(A,x); (void)_; }
    catch (const std::runtime_error&) { threw=true; }
    CHECK(threw, "matvec must throw on dimension mismatch");
 }
 TEST_CASE(Matvec_Zero_Edges) {
    Md A(0,3,0.0);            // 0x3
    Vd x(3,1.0);
    auto y = numerics::matvec<double>(A,x);
    CHECK(y.size()==0, "0xN * x should return size 0 vector");
    Md B(2,0,0.0);            // 2x0
    Vd z(0,0.0);
    auto y2 = numerics::matvec<double>(B,z);
    CHECK(y2.size()==2 && y2[0]==0.0 && y2[1]==0.0, "N×0 * 0 should return zeros of size N");
 }
 TEST_CASE(Matvec_Float_Tolerance) {
    Mf A(2,2,0.0f); A(0,0)=1.0f; A(0,1)=2.0f; A(1,0)=3.0f; A(1,1)=4.0f;
    Vf x(2,0.0f); x[0]=0.1f; x[1]=0.2f;
    auto y1 = numerics::matvec<float>(A,x);
    auto y2 = numerics::matvec_omp<float>(A,x);
    CHECK((y1.nearly_equal_vec(y2,1e-6f)), "matvec float omp mismatch");
 }
 //
 // ---------- Auto inside an outer parallel region (no accidental nested teams) ----------
 //   We just check correctness; performance is environment-dependent.
 //
 TEST_CASE(Matvec_Auto_Inside_Outer_Parallel_Correctness) {
    const uint64_t m=64, n=64;
    Md A(m,n,1.0); Vd x(n,2.0);
    //fill_deterministic(A); fill_deterministic(x);
    Vd ref = numerics::matvec<double>(A,x);
    // Call auto inside an outer team
    #ifdef _OPENMP
    #pragma omp parallel for schedule(static)
    #endif
    for (int rep=0; rep<32; ++rep) {
        auto y = numerics::matvec_auto<double>(A,x);
        // Each thread checks its own result equals reference
        if (!(y.nearly_equal_vec(ref))) {
            throw TestFailure("matvec_auto wrong under outer parallel region");
        }
    }
 }
 TEST_CASE(Matvec_Speed_Sanity) {
    const uint64_t m=4096, n=4096;      // ~16M MACs; adjust if needed
    Md A(m,n,1.0); Vd x(n,2.0);
    //fill_deterministic(A); fill_deterministic(x);
    auto t0 = std::chrono::high_resolution_clock::now();
    auto yS = numerics::matvec(A,x);
    double tp = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    #ifdef _OPENMP
      int threads = omp_get_max_threads();
    #else
      int threads = 1;
    #endif
    t0 = std::chrono::high_resolution_clock::now();
    auto yP = numerics::matvec_omp(A,x);
    double ts = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    CHECK((yS.nearly_equal_vec(yP)), "matvec_omp != matvec_serial (large)");
    // Only enforce basic sanity if we *can* use >1 threads:
    if (threads > 1) {
        // Be generous: just require not significantly slower.
        CHECK(tp >= ts, "matvec_omp unexpectedly much slower than serial");
    }
 }
 // ------------------------------------------------------------
 // vecmat: y = x * A
 // ------------------------------------------------------------
 TEST_CASE(Vecmat_Serial_Simple) {
    // A = [[1,2],
    //      [3,4],
    //      [5,6]]  (3x2)
    Md A(3,2,0.0);
    A(0,0)=1; A(0,1)=2;
    A(1,0)=3; A(1,1)=4;
    A(2,0)=5; A(2,1)=6;
    Vd x(3,0.0); x[0]=7; x[1]=8; x[2]=9;
    auto y = numerics::vecmat<double>(x,A); // 1*7+3*8+5*9= 76 ; 2*7+4*8+6*9=100
    CHECK(y.size()==2, "vecmat size wrong");
    CHECK(y[0]==76.0 && y[1]==100.0, "vecmat values wrong");
 }
 TEST_CASE(Vecmat_OMP_Equals_Serial) {
    Md A(2,2,0.0); A(0,0)=2; A(0,1)=1; A(1,0)=5; A(1,1)=-1;
    Vd x(2,0.0); x[0]=0.5; x[1]=1.5;
    auto ys = numerics::vecmat<double>(x,A);
    auto yp = numerics::vecmat_omp<double>(x,A);
    CHECK((ys.nearly_equal_vec(yp)), "vecmat_omp != vecmat");
 }
 TEST_CASE(Vecmat_Auto_Equals_Serial) {
    Md A(2,3,0.0);
    A(0,0)=1; A(0,1)=2; A(0,2)=3;
    A(1,0)=4; A(1,1)=5; A(1,2)=6;
    Vd x(2,0.0); x[0]=1; x[1]=2;
    auto ys = numerics::vecmat<double>(x,A);
    auto ya = numerics::vecmat_auto<double>(x,A);
    CHECK((ys.nearly_equal_vec(ya)), "vecmat_auto != serial");
 }
 TEST_CASE(Vecmat_DimensionMismatch_Throws) {
    Md A(2,2,0.0);
    Vd x(3,0.0);
    bool threw=false;
    try { auto _ = numerics::vecmat<double>(x,A); (void)_; }
    catch (const std::runtime_error&) { threw=true; }
    CHECK(threw, "vecmat must throw on dimension mismatch");
 }
 TEST_CASE(Vecmat_Zero_Edges) {
    Md A(0,3,0.0);
    Vd x(0,0.0);
    auto y = numerics::vecmat<double>(x,A); // 0×N times N×M → 0×M
    CHECK(y.size()==3 && y[0]==0.0 && y[1]==0.0 && y[2]==0.0, "0-length x times A wrong");
    Md B(3,0,0.0);
    Vd z(3,1.0);
    auto y2 = numerics::vecmat<double>(z,B); // 1x3 * 3x0 → 1x0
    CHECK(y2.size()==0, "vecmat with N×0 result size wrong");
 }
 //
 // ---------- Auto inside an outer parallel region (no accidental nested teams) ----------
 //   We just check correctness; performance is environment-dependent.
 //
 TEST_CASE(Vecmat_Auto_Inside_Outer_Parallel_Correctness) {
    const uint64_t m=64, n=64;
    Md A(m,n,1.0); Vd x(m,2.0);
    //fill_deterministic(A); fill_deterministic(x);
    Vd ref = numerics::vecmat<double>(x,A);
    #ifdef _OPENMP
    #pragma omp parallel for schedule(static)
    #endif
    for (int rep=0; rep<32; ++rep) {
        auto y = numerics::vecmat_auto<double>(x,A);
        if (!(y.nearly_equal_vec(ref))) {
            throw TestFailure("vecmat_auto wrong under outer parallel region");
        }
    }
 }
 TEST_CASE(Vecmat_Speed_Sanity) {
    const uint64_t m=4096, n=4096;
    Md A(m,n,1.0); Vd x(m,2.0);
    //fill_deterministic(A); fill_deterministic(x);
    auto t0 = std::chrono::high_resolution_clock::now();
    auto yS = numerics::vecmat<double>(x,A);
    double ts = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    #ifdef _OPENMP
      int threads = omp_get_max_threads();
    #else
      int threads = 1;
    #endif
    t0 = std::chrono::high_resolution_clock::now();
    auto yP = numerics::vecmat_omp<double>(x,A);
    double tp = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
    CHECK((yS.nearly_equal_vec(yP)), "vecmat_omp != vecmat_serial (large)");
    if (threads > 1) {
        CHECK(tp <= ts, "vecmat_omp unexpectedly much slower than serial");
    }
 }
--- a/test/test_transpose.cpp
+++ b/test/test_transpose.cpp
@@ -0,0 +1,151 @@
 #include "test_common.h"
 //#include "./utils/matrix.h"           // matrix.h, vector.h
 #include "./numerics/transpose.h"    // numerics::transpose / inplace_transpose
 using utils::Mi; using utils::Mf; using utils::Md;
 /// ---- helpers ----
 template <typename T>
 static void fill_seq(utils::Matrix<T>& M, T start = T(0), T step = T(1)) {
    std::uint64_t k = 0;
    for (std::uint64_t i=0; i<M.rows(); ++i)
        for (std::uint64_t j=0; j<M.cols(); ++j, ++k)
            M(i,j) = start + step * static_cast<T>(k);
 }
 template <typename T>
 static bool mats_equal(const utils::Matrix<T>& X, const utils::Matrix<T>& Y) {
    if (X.rows()!=Y.rows() || X.cols()!=Y.cols()) return false;
    for (std::uint64_t i=0; i<X.rows(); ++i)
        for (std::uint64_t j=0; j<X.cols(); ++j)
            if (X(i,j) != Y(i,j)) return false;
    return true;
 }
 // ---- tests ----
 // Empty and 1x1 edge cases
 TEST_CASE(Transpose_Edges) {
    utils::Mi E; // 0x0
    auto Et = numerics::transpose(E);
    CHECK(Et.rows()==0 && Et.cols()==0, "transpose of empty should be empty");
    utils::Mi S(1,1,42);
    auto St = numerics::transpose(S);
    CHECK(St.rows()==1 && St.cols()==1, "1x1 stays 1x1");
    CHECK(St(0,0)==42, "1x1 value preserved");
 }
 // Rectangular out-of-place
 TEST_CASE(Transpose_Rectangular) {
    const std::uint64_t r=3, c=5;
    utils::Mi A(r,c,0);
    fill_seq(A, int64_t(1), int64_t(1));
    auto B = numerics::transpose(A);
    CHECK(B.rows()==c && B.cols()==r, "shape swapped");
    for (std::uint64_t i=0;i<r;++i)
        for (std::uint64_t j=0;j<c;++j)
            CHECK(B(j,i)==A(i,j), "transpose content mismatch");
 }
 // Square: in-place equals out-of-place
 TEST_CASE(Transpose_Inplace_Equals_OutOfPlace) {
    const std::uint64_t n=7;
    utils::Mi A(n,n,0);
    fill_seq(A, int64_t(10), int64_t(3));
    auto B = numerics::transpose(A);
    auto C = A; // copy
    numerics::inplace_transpose_square(C);
    CHECK(mats_equal(B, C), "inplace transpose should match out-of-place");
 }
 // In-place should throw on non-square
 TEST_CASE(Transpose_Inplace_Throws_On_Rect) {
    utils::Mi A(2,3,0);
    bool threw=false;
    try { numerics::inplace_transpose_square(A); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "inplace_transpose_square must throw on non-square");
 }
 // --- OMP variants (compile only with -fopenmp) ---
 #ifdef _OPENMP
 TEST_CASE(Transpose_OMP_OutOfPlace_Equals_Serial) {
    const std::uint64_t r=17, c=31;
    utils::Mi A(r,c,0);
    fill_seq(A, int64_t(5), int64_t(2));
    auto B_serial = numerics::transpose(A);
    auto B_omp    = numerics::transpose_omp(A);
    CHECK(mats_equal(B_serial, B_omp), "transpose_omp != transpose");
 }
 TEST_CASE(Transpose_OMP_Inplace_Equals_Serial) {
    const std::uint64_t n=32;
    utils::Mi A(n,n,0);
    fill_seq(A, int64_t(0), int64_t(1));
    auto B_ref = numerics::transpose(A);
    auto C = A;
    numerics::inplace_transpose_square_omp(C);
    CHECK(mats_equal(B_ref, C), "inplace_transpose_square_omp != transpose");
 }
 // Auto selectors (if you added transpose_auto / inplace_transpose_square_auto_auto)
 TEST_CASE(Transpose_Auto_Equals_Serial) {
    // Rectangular: transpose_auto
    utils::Mi A(23,11,0);
    fill_seq(A, int64_t(1), int64_t(1));
    auto B_ref = numerics::transpose(A);
    auto B_auto = numerics::transpose_auto(A);
    CHECK(mats_equal(B_ref, B_auto), "transpose_auto != transpose");
    // Square: inplace_transpose_square_auto
    utils::Mi S(29,29,0);
    fill_seq(S, int64_t(4), int64_t(7));
    auto Tref = numerics::transpose(S);
    numerics::inplace_transpose_square_auto(S);
    CHECK(mats_equal(Tref, S), "inplace_transpose_square_auto != transpose");
 }
 // Nested callsite sanity: call OMP versions inside an outer region
 TEST_CASE(Transpose_OMP_Nested_Callsite) {
    // Out-of-place on rectangular
    utils::Mi A(19,37,0);
    fill_seq(A, int64_t(2), int64_t(3));
    auto Bref = numerics::transpose(A);
    int prev_levels = omp_get_max_active_levels();
    omp_set_max_active_levels(2);
    utils::Mi Bnest;
    #pragma omp parallel num_threads(2)
    {
        #pragma omp single
        {
            Bnest = numerics::transpose_omp(A);
        }
    }
    CHECK(mats_equal(Bref, Bnest), "nested transpose_omp mismatch");
    // In-place on square
    utils::Mi S(41,41,0);
    fill_seq(S, int64_t(1), int64_t(5));
    auto Sref = numerics::transpose(S);
    #pragma omp parallel num_threads(2)
    {
        #pragma omp single
        {
            numerics::inplace_transpose_square_omp(S);
        }
    }
    omp_set_max_active_levels(prev_levels);
    CHECK(mats_equal(Sref, S), "nested inplace_transpose_square_omp mismatch");
 }
 #endif
--- a/test/test_vector.cpp
+++ b/test/test_vector.cpp
@@ -0,0 +1,196 @@
 #include "test_common.h"
 #include "./utils/utils.h"
 using utils::Vf; using utils::Vd; using utils::Vi;
 // ---------- helpers ----------
 template <typename T>
 static bool vec_equal_exact(const utils::Vector<T>& a, const utils::Vector<T>& b) {
    if (a.size() != b.size()) return false;
    for (std::uint64_t i=0;i<a.size();++i) if (a[i]!=b[i]) return false;
    return true;
 }
 // ---------- tests ----------
 TEST_CASE(Vector_Construct_Size_Fill) {
    utils::Vd v0;
    CHECK(v0.size()==0, "default size should be 0");
    utils::Vd v1(5, 3.5);
    CHECK(v1.size()==5, "size 5");
    for (std::uint64_t i=0;i<5;++i) CHECK(v1[i]==3.5, "filled value 3.5");
 }
 TEST_CASE(Vector_PushBack_Resize) {
    utils::Vi v;
    v.push_back(1); v.push_back(2);
    CHECK(v.size()==2, "push_back size");
    CHECK(v[0]==1 && v[1]==2, "push_back contents");
    v.resize(5, 7);
    CHECK(v.size()==5, "resize size");
    CHECK(v[2]==7 && v[3]==7 && v[4]==7, "resize fill value");
 }
 TEST_CASE(Vector_Data_ReadWrite) {
    utils::Vd v(4, 0.0);
    double* p = v.data();
    for (std::uint64_t i=0;i<4;++i) p[i] = double(i+1);
    CHECK(v[0]==1.0 && v[3]==4.0, "write via data()");
    const utils::Vd& cv = v;
    const double* cp = cv.data();
    double s=0.0; for (std::uint64_t i=0;i<cv.size();++i) s += cp[i];
    CHECK(std::fabs(s-10.0) < 1e-12, "read via const data()");
 }
 TEST_CASE(Vector_Equality_and_NearlyEqual) {
    utils::Vd a(3, 1.0), b(3, 1.0);
    CHECK(a==b, "operator== equal");
    b[1] += 5e-7;
    CHECK(a==b, "operator== within eps (1e-6)");
    b[1] += 2e-6;
    CHECK(!(a==b), "operator== beyond eps");
    utils::Vd c = a;
    c[2] += 1e-10;
    CHECK(c.nearly_equal_vec(a, 1e-9), "nearly_equal_vec within tol");
    c[2] += 1e-6;
    CHECK(!c.nearly_equal_vec(a, 1e-9), "nearly_equal_vec beyond tol");
 }
 TEST_CASE(Vector_Scalar_Arithmetic) {
    utils::Vi v(3, 10); // [10,10,10]
    auto vadd = v + 2;              // [12,12,12]
    CHECK(vadd[0]==12 && vadd[2]==12, "scalar +");
    v += 3;                          // [13,13,13]
    CHECK(v[1]==13, "scalar +=");
    auto vsub = v - 5;               // [8,8,8]
    CHECK(vsub[2]==8, "scalar -");
    v -= 3;                          // [10,10,10]
    CHECK(v[0]==10, "scalar -=");
    auto vmul = v * 2;               // [20,20,20]
    CHECK(vmul[0]==20 && vmul[1]==20, "scalar *");
    v *= 3;                          // [30,30,30]
    CHECK(v[2]==30, "scalar *=");
    auto vdiv = v / 3;               // [10,10,10]
    CHECK(vdiv[0]==10 && vdiv[1]==10, "scalar /");
    v /= 2;                          // [15,15,15]
    CHECK(v[0]==15 && v[2]==15, "scalar /=");
 }
 TEST_CASE(Vector_Vector_Arithmetic) {
    utils::Vi a(4, 1), b(4, 2);
    auto c = a + b;  // [3,3,3,3]
    CHECK(c[0]==3 && c[3]==3, "v+v");
    a += b;          // [3,3,3,3]
    CHECK(a[1]==3, "v+=v");
    auto d = a - b;  // [1,1,1,1]
    CHECK(d[2]==1, "v-v");
    a -= b;          // [1,1,1,1]
    CHECK(a[0]==1, "v-=v");
    auto e = a * b;  // [2,2,2,2]
    CHECK(e[1]==2, "v*v (elemwise)");
    a *= b;          // [2,2,2,2]
    CHECK(a[3]==2, "v*=v");
    auto f = e / b;  // [1,1,1,1]
    CHECK(f[0]==1 && f[3]==1, "v/v (elemwise)");
    e /= b;          // [1,1,1,1]
    CHECK(e[2]==1, "v/=v");
 }
 TEST_CASE(Vector_Friend_Scalar_Left) {
    utils::Vd v(3, 2.0);  // [2,2,2]
    auto s1 = 3.0 + v;    // [5,5,5]
    CHECK(s1[0]==5.0 && s1[2]==5.0, "left scalar +");
    auto s2 = 4.0 * v;    // [8,8,8]
    CHECK(s2[1]==8.0, "left scalar *");
 }
 TEST_CASE(Vector_Power_and_Sqrt) {
    utils::Vd v(3, 4.0);         // [4,4,4]
    auto p = v.power(2.0);       // [16,16,16]
    CHECK(p[0]==16.0 && p[2]==16.0, "power scalar");
    v.inplace_sqrt();            // sqrt([4,4,4]) -> [2,2,2]
    CHECK(v[0]==2.0 && v[1]==2.0, "inplace_sqrt");
 }
 TEST_CASE(Vector_Dot_Sum_Norm) {
    utils::Vd a(3, 0.0), b(3, 0.0);
    a[0]=1.0; a[1]=2.0; a[2]=3.0;      // a = [1,2,3]
    b[0]=4.0; b[1]=5.0; b[2]=6.0;      // b = [4,5,6]
    double dot = a.dot(b);             // 1*4 + 2*5 + 3*6 = 32
    CHECK(std::fabs(dot - 32.0) < 1e-12, "dot");
    double s = a.sum();                // 6
    CHECK(std::fabs(s - 6.0) < 1e-12, "sum");
    double n = a.norm();               // sqrt(14)
    CHECK(std::fabs(n - std::sqrt(14.0)) < 1e-12, "norm");
 }
 TEST_CASE(Vector_Normalize_and_Throws) {
    utils::Vd v(3, 0.0);
    v[0]=3.0; v[1]=4.0; v[2]=0.0;           // norm = 5
    auto u = v.normalize();                  // returns new vector
    CHECK(std::fabs(u.norm() - 1.0) < 1e-12, "normalize() unit length");
    v.inplace_normalize();
    CHECK(std::fabs(v.norm() - 1.0) < 1e-12, "inplace_normalize unit length");
    utils::Vd z(3, 0.0);
    bool threw=false;
    try { z.inplace_normalize(); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "normalize should throw on zero vector");
 }
 // Size mismatch throws (elementwise ops)
 TEST_CASE(Vector_Size_Mismatch_Throws) {
    utils::Vi a(3,1), b(4,2);
    bool threw=false;
    try { (void)a.dot(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "dot size mismatch should throw");
    threw=false;
    try { a.inplace_add(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "add size mismatch should throw");
    threw=false;
    try { a.inplace_subtract(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "subtract size mismatch should throw");
    threw=false;
    try { a.inplace_multiply(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "multiply size mismatch should throw");
    threw=false;
    try { a.inplace_divide(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "divide size mismatch should throw");
    threw=false;
    try { a.inplace_power(b); } catch(const std::runtime_error&) { threw=true; }
    CHECK(threw, "power size mismatch should throw");
 }
		`@@ -0,0 +1,2 @@`
							`#define TEST_MAIN`
							`#include "test_common.h"`