Ready for fvm steady case

2025-09-21 20:57:02 +02:00
parent 3a53b6ebf7
commit 513f071748
59 changed files with 1813 additions and 983 deletions
@@ -4,161 +4,124 @@

 #include <chrono>

+// ---------- helpers ----------
+/// Element-wise comparison of two matrices, used by the test assertions below.
+///
+/// @param X   left-hand matrix
+/// @param Y   right-hand matrix
+/// @param tol absolute per-element tolerance; only consulted when T is a
+///            floating-point type (other element types must match exactly)
+/// @return true when both shapes agree and every element pair agrees
+///
+/// A NaN on either side counts as a mismatch, so a kernel that produces NaN
+/// cannot pass a comparison against a finite reference.
+template <typename T>
+static bool mats_equal(const utils::Matrix<T>& X, const utils::Matrix<T>& Y, double tol = 0.0) {
+    if (X.rows()!=Y.rows() || X.cols()!=Y.cols()) return false;
+    const bool approx = std::is_floating_point<T>::value;
+    for (std::uint64_t i=0;i<X.rows();++i) {
+        for (std::uint64_t j=0;j<X.cols();++j) {
+            // Identical elements (including equal infinities, whose difference
+            // would be NaN) need no tolerance check.
+            if (X(i,j) != Y(i,j)) {
+                if (!approx) return false;
+                const double diff = std::fabs(double(X(i,j)) - double(Y(i,j)));
+                // Written as !(diff <= tol) rather than diff > tol so that a
+                // NaN difference is rejected instead of silently accepted.
+                if (!(diff <= tol)) return false;
+            }
+        }
+    }
+    return true;
+}

-// ============ Basic correctness ============
-TEST_CASE(Matmul_Serial_Simple3x3) {
-    utils::Md A(3,3,0.0), B(3,3,0.0);
-    // A = [[1,2,3],[4,5,6],[7,8,9]]
-    double v=1.0;
-    for (uint64_t i=0;i<3;++i) for (uint64_t j=0;j<3;++j) A(i,j)=v++;
-    // B = [[9,8,7],[6,5,4],[3,2,1]]
-    double w=9.0;
-    for (uint64_t i=0;i<3;++i) for (uint64_t j=0;j<3;++j) B(i,j)=w--;

-    auto C = numerics::matmul<double>(A,B);
-    // Hand-checked first row:
-    // row0 dot columns:
-    // c00 = 1*9 + 2*6 + 3*3 = 30
-    // c01 = 1*8 + 2*5 + 3*2 = 24
-    // c02 = 1*7 + 2*4 + 3*1 = 18
+/// Walks M row by row and writes an arithmetic sequence into it: the element
+/// visited k-th (k counted from 0) receives start + step * k.
+template <typename T>
+static void fill_seq(utils::Matrix<T>& M, T start = T(0), T step = T(1)) {
+    std::uint64_t idx = 0;
+    for (std::uint64_t r = 0; r < M.rows(); ++r) {
+        for (std::uint64_t c = 0; c < M.cols(); ++c) {
+            M(r, c) = start + step * static_cast<T>(idx);
+            ++idx;
+        }
+    }
+}
+// ---------- tests ----------
+
+// Small known example: (3x2) · (2x3)
+TEST_CASE(Matmul_Small_Known) { // fixed 3x2 by 2x3 product; shape and all nine entries are checked
+    utils::Mi A(3,2,0), B(2,3,0);
+    // A = [1 2; 3 4; 5 6]
+    A(0,0)=1; A(0,1)=2;
+    A(1,0)=3; A(1,1)=4;
+    A(2,0)=5; A(2,1)=6;
+    // B = [7 8 9; 10 11 12]
+    B(0,0)=7;  B(0,1)=8;  B(0,2)=9;
+    B(1,0)=10; B(1,1)=11; B(1,2)=12;
+
+    auto C = numerics::matmul(A,B);
    CHECK(C.rows()==3 && C.cols()==3, "shape 3x3 wrong");
-    CHECK(C(0,0)==30.0 && C(0,1)==24.0 && C(0,2)==18.0, "first row wrong");
+
+    // Expected C (each entry is row i of A dotted with column j of B, e.g. C(0,0) = 1*7 + 2*10 = 27):
+    // [27 30 33]
+    // [61 68 75]
+    // [95 106 117]
+    CHECK(C(0,0)==27 && C(0,1)==30 && C(0,2)==33, "row 0 wrong");
+    CHECK(C(1,0)==61 && C(1,1)==68 && C(1,2)==75, "row 1 wrong");
+    CHECK(C(2,0)==95 && C(2,1)==106 && C(2,2)==117, "row 2 wrong");
 }

-TEST_CASE(Matmul_OMP_Equals_Serial) {
-    utils::Md A(4,5,0.0), B(5,3,0.0);
-    // Fill deterministic
-    for (uint64_t i=0;i<A.rows();++i)
-      for (uint64_t j=0;j<A.cols();++j)
-        A(i,j) = 0.1*(1 + (i*17 + j*19)%10);
-    for (uint64_t i=0;i<B.rows();++i)
-      for (uint64_t j=0;j<B.cols();++j)
-        B(i,j) = 0.2*(1 + (i*23 + j*29)%10);
-
-    auto Cs = numerics::matmul<double>(A,B);
-    auto Cr = numerics::matmul_rows_omp<double>(A,B);
-    auto Cc = numerics::matmul_collapse_omp<double>(A,B);
-    auto Ca = numerics::matmul_auto<double>(A,B);
-
-    CHECK((Cs.nearly_equal(Cr, 1e-12)), "rows_omp != serial");
-    CHECK((Cs.nearly_equal(Cc, 1e-12)), "collapse_omp != serial");
-    CHECK((Cs.nearly_equal(Ca, 1e-12)), "auto != serial");
-}
-
-// ============ Dimension mismatch ============
-TEST_CASE(Matmul_DimensionMismatch_Throws) {
-    utils::Md A(2,3,0.0), B(4,2,0.0);
+TEST_CASE(Matmul_DimMismatch_Throws) { // passes only if matmul raises std::runtime_error for incompatible shapes
+    utils::Md A(2,3,1.0), B(4,2,2.0); // A.cols()!=B.rows()
    bool threw=false;
-    try { auto _ = numerics::matmul<double>(A,B); (void)_; }
-    catch (const std::runtime_error&) { threw=true; }
-    CHECK(threw, "serial should throw on dim mismatch");
-
-    threw=false; try { auto _ = numerics::matmul_rows_omp<double>(A,B); (void)_; }
-    catch (const std::runtime_error&) { threw=true; }
-    CHECK(threw, "rows_omp should throw on dim mismatch");
-
-    threw=false; try { auto _ = numerics::matmul_collapse_omp<double>(A,B); (void)_; }
-    catch (const std::runtime_error&) { threw=true; }
-    CHECK(threw, "collapse_omp should throw on dim mismatch");
+    try { (void)numerics::matmul(A,B); } catch(const std::runtime_error&) { threw=true; } // result is discarded; any other exception type propagates out of the test
+    CHECK(threw, "matmul should throw on dim mismatch");
 }

-// ============ Edge cases ============
-TEST_CASE(Matmul_Edges_ZeroDims) {
-    // (0xK) * (KxP) -> (0xP)
-    utils::Md A0(0,5,0.0), B1(5,3,0.0);
-    auto C0 = numerics::matmul<double>(A0,B1);
-    CHECK(C0.rows()==0 && C0.cols()==3, "0xK * KxP shape wrong");
+// Compare all variants vs serial on a moderate size
+TEST_CASE(Matmul_Variants_Equal_Int) { // (32x24) * (24x16) product computed by four entry points and compared
+    const std::uint64_t m=32, n=24, p=16;
+    utils::Mi A(m,n,0), B(n,p,0);

-    // (MxK) * (Kx0) -> (Mx0)
-    utils::Md A2(7,4,0.0), B0(4,0,0.0);
-    auto C1 = numerics::matmul<double>(A2,B0);
-    CHECK(C1.rows()==7 && C1.cols()==0, "MxK * Kx0 shape wrong");
+    // deterministic fill (no randomness)
+    fill_seq(A, int64_t(1), int64_t(1));
+    fill_seq(B, int64_t(2), int64_t(3));
+
+    auto C_ref = numerics::matmul(A,B); // reference result the other three are checked against
+
+    auto C_rows     = numerics::matmul_rows_omp(A,B);
+    auto C_collapse = numerics::matmul_collapse_omp(A,B);
+    auto C_auto     = numerics::matmul_auto(A,B);
+
+    CHECK(mats_equal(C_rows,     C_ref), "rows_omp != serial"); // no tol argument: mats_equal's default of 0.0 applies
+    CHECK(mats_equal(C_collapse, C_ref), "collapse_omp != serial");
+    CHECK(mats_equal(C_auto,     C_ref), "auto != serial");
 }

-// ============ Identity sanity ============
-TEST_CASE(Matmul_Identity) {
-    const uint64_t n=5;
-    utils::Md I(n,n,0.0), A(n,n,0.0);
-    for (uint64_t i=0;i<n;++i) I(i,i)=1.0;
-    for (uint64_t i=0;i<n;++i)
-      for (uint64_t j=0;j<n;++j)
-        A(i,j) = (i==j)? 2.0 : ( (i<j)? 1.0 : -1.0 );
+TEST_CASE(Matmul_Variants_Equal_Double) { // (33x17) * (17x19) product computed by four entry points and compared
+    const std::uint64_t m=33, n=17, p=19;
+    utils::Md A(m,n,0.0), B(n,p,0.0);

-    auto L = numerics::matmul<double>(I,A);
-    auto R = numerics::matmul<double>(A,I);
-    CHECK(L == A, "I*A != A");
-    CHECK(R == A, "A*I != A");
+    fill_seq(A, 0.1, 0.01); // A holds 0.1, 0.11, 0.12, ... in fill order
+    fill_seq(B, 1.0, 0.02); // B holds 1.0, 1.02, 1.04, ... in fill order
+
+    auto C_ref = numerics::matmul(A,B); // reference result the other three are checked against
+    auto C_rows     = numerics::matmul_rows_omp(A,B);
+    auto C_collapse = numerics::matmul_collapse_omp(A,B);
+    auto C_auto     = numerics::matmul_auto(A,B);
+
+    CHECK(mats_equal(C_rows,     C_ref, 1e-9), "rows_omp != serial (double)"); // 1e-9 is an absolute per-element tolerance; presumably absorbs rounding differences between variants - confirm
+    CHECK(mats_equal(C_collapse, C_ref, 1e-9), "collapse_omp != serial (double)");
+    CHECK(mats_equal(C_auto,     C_ref, 1e-9), "auto != serial (double)");
 }

-// ============ Perf sanity (same kernel: 1 thread vs many) ============
-template <class F>
-static double time_it(F&& f, int iters=1) {
-    auto t0 = std::chrono::high_resolution_clock::now();
-    for (int i=0;i<iters;++i) f();
-    auto t1 = std::chrono::high_resolution_clock::now();
-    return std::chrono::duration<double>(t1 - t0).count();
+// Nested callsite sanity: call OMP variant from within an outer region
+#ifdef _OPENMP
+TEST_CASE(Matmul_OMP_Nested_Callsite) { // calls matmul_collapse_omp from inside an enclosing parallel region and compares with matmul
+    const std::uint64_t m=48, n=24, p=32;
+    utils::Mi A(m,n,0), B(n,p,0);
+    fill_seq(A, int64_t(1), int64_t(2));
+    fill_seq(B, int64_t(3), int64_t(1));
+
+    auto C_ref = numerics::matmul(A,B); // computed before the parallel region is opened
+
+    int prev_levels = omp_get_max_active_levels(); // remembered so the runtime setting can be put back afterwards
+    omp_set_max_active_levels(2); // permit two nested active parallel levels for this test
+
+    utils::Mi C_nested;
+    #pragma omp parallel num_threads(2)
+    {
+        #pragma omp single
+        {
+            // either variant is fine; collapse(2) has more parallelism
+            // (inside `single`, so exactly one thread of the outer team makes the call and assigns C_nested)
+            C_nested = numerics::matmul_collapse_omp(A,B);
+        }
+    }
+
+    omp_set_max_active_levels(prev_levels); // NOTE(review): skipped if the call above throws - confirm that is acceptable
+
+    CHECK(mats_equal(C_nested, C_ref), "nested collapse_omp result mismatch");
 }
-
-TEST_CASE(Matmul_Perf_Sanity_RowOMP) {
-#ifndef _OPENMP
-    return;
-#else
-    int hw = omp_get_max_threads();
-    if (hw <= 1) return;
-
-    const uint64_t m=512, k=512, p=512; // ~134M MACs; adjust if needed
-    utils::Md A(m,k,0.0), B(k,p,0.0);
-    for (uint64_t i=0;i<m;++i) for (uint64_t j=0;j<k;++j) A(i,j)= (i+j%7)*0.001;
-    for (uint64_t i=0;i<k;++i) for (uint64_t j=0;j<p;++j) B(i,j)= (i*3+j%5)*0.0005;
-
-    // Warm-up
-    (void) numerics::matmul_rows_omp<double>(A,B);
-
-    int prev = omp_get_max_threads();
-    auto t0 = std::chrono::high_resolution_clock::now();
-    omp_set_num_threads(1);
-    numerics::matmul_rows_omp<double>(A,B);
-    double t1 = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
-
-    omp_set_num_threads(hw);
-    t0 = std::chrono::high_resolution_clock::now();
-    numerics::matmul_rows_omp<double>(A,B);
-    double tN = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
-
-    omp_set_num_threads(prev);
-
-    // Must not be notably slower with many threads
-    CHECK(tN <= t1 * 1.05, "rows_omp: multi-thread slower than single-thread");
-#endif
-}
-
-TEST_CASE(Matmul_Perf_Sanity_CollapseOMP) {
-#ifndef _OPENMP
-    return;
-#else
-    int hw = omp_get_max_threads();
-    if (hw <= 1) return;
-
-    const uint64_t m=512, k=512, p=512;
-    utils::Md A(m,k,0.0), B(k,p,0.0);
-    for (uint64_t i=0;i<m;++i) for (uint64_t j=0;j<k;++j) A(i,j)= (i*7+j%11)*0.0003;
-    for (uint64_t i=0;i<k;++i) for (uint64_t j=0;j<p;++j) B(i,j)= (i%13+j)*0.0002;
-
-    (void) numerics::matmul_collapse_omp<double>(A,B); // warm-up
-
-    int prev = omp_get_max_threads();
-    auto t0 = std::chrono::high_resolution_clock::now();
-    omp_set_num_threads(1);
-    numerics::matmul_collapse_omp<double>(A,B);
-    double t1 = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
-
-    omp_set_num_threads(hw);
-    t0 = std::chrono::high_resolution_clock::now();
-    numerics::matmul_collapse_omp<double>(A,B);
-    double tN = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - t0).count();
-
-    omp_set_num_threads(prev);
-
-    CHECK(tN <= t1 * 1.05, "collapse_omp: multi-thread slower than single-thread");
-#endif
-}
+#endif