Finishing up and starting lu decomp

2025-09-13 21:44:20 +02:00
parent 320436ce98
commit 88087ea6a6
24 changed files with 1502 additions and 699 deletions
@@ -4,31 +4,50 @@
 #include <omp.h>


-// Configure OpenMP behavior at runtime.
-inline void omp_configure(int max_active_levels,
-                          bool dynamic_threads,
-                          const std::vector<int>& threads_per_level = {},
-                          bool bind_close = true)
-{
-    // 1) Allow nested parallel regions (levels of teams)
-    //    Example: outer #pragma omp parallel ... { inner #pragma omp parallel ... }
-    omp_set_max_active_levels(max_active_levels);   // 1 = only top-level; 2+ enables nesting
+namespace omp_config{

-    // 2) Let the runtime shrink/grow thread counts if it thinks it should
-    //    (helps avoid oversubscription when you accidentally ask for too many threads)
-    omp_set_dynamic(dynamic_threads ? 1 : 0);
+    // Configure OpenMP behavior at runtime.
+    inline void omp_configure(int max_active_levels,
+                              bool dynamic_threads,
+                              const std::vector<int>& threads_per_level = {},
+                              bool bind_close = true)
+    {
+        // 1) Allow nested parallel regions (levels of teams)
+        //    Example: outer #pragma omp parallel ... { inner #pragma omp parallel ... }
+        omp_set_max_active_levels(max_active_levels);   // 1 = only top-level; 2+ enables nesting

-    // 3) Thread binding (keep threads near their cores) is controlled via env vars,
-    //    so here we just *recommend* a good default (see below). You *can* setenv()
-    //    in code, but it’s cleaner to do it outside the program.
-    (void)bind_close; // documented below in env var section
+        // 2) Let the runtime shrink/grow thread counts if it thinks it should
+        //    (helps avoid oversubscription when you accidentally ask for too many threads)
+        omp_set_dynamic(dynamic_threads ? 1 : 0);

-    // 4) Top-level default thread count (inner levels are usually set per region)
-    if (!threads_per_level.empty()) {
-        omp_set_num_threads(threads_per_level[0]);  // e.g. 16 for the outermost team
-        // Inner levels:
-        //   - Use num_threads(threads_per_level[L]) on the inner #pragma omp parallel
-        //   - or set OMP_NUM_THREADS="outer,inner,inner2" as an environment variable
+        // 3) Thread binding (keep threads near their cores) is controlled via env vars,
+        //    so here we just *recommend* a good default (see below). You *can* setenv()
+        //    in code, but it’s cleaner to do it outside the program.
+        (void)bind_close; // documented below in env var section
+
+        // 4) Top-level default thread count (inner levels are usually set per region)
+        if (!threads_per_level.empty()) {
+            omp_set_num_threads(threads_per_level[0]);  // e.g. 16 for the outermost team
+            // Inner levels:
+            //   - Use num_threads(threads_per_level[L]) on the inner #pragma omp parallel
+            //   - or set OMP_NUM_THREADS="outer,inner,inner2" as an environment variable
+        }
    }
-}

+    // ---------- Helper: may we create another team? ----------
+    inline bool omp_parallel_allowed() {
+        #ifdef _OPENMP
+          // If we’re not in parallel, we can spawn.
+          if (!omp_in_parallel()) return true;
+
+          // Already inside parallel: allow only if nesting is enabled and not at limit.
+          int level = omp_get_active_level();          // 0 outside parallel, 1 inside, ...
+          int maxlv = omp_get_max_active_levels();     // user/runtime cap
+          return static_cast<bool>(level < maxlv);
+        #else
+          return false; // no OpenMP → no extra teams
+        #endif
+    }
+
+
+} // namespace omp_config
@@ -0,0 +1,4 @@
+#pragma once
+
+#include "./decomp/lu.h"
+
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+namespace decomp{
+
+	// Stores PA = LU with partial pivoting (row permutations).
+	template <typename T>
+	struct LU{
+		utils::Matrix<T> LUmat; 	// combined L (unit diagonal implied) and U
+		std::vector<uint64_t> piv;	// pivot indices (row permutations)
+		bool singular= false;		// set true if zero (or near-zero) pivots encountered
+
+		LU() = default;
+
+    	explicit LU(const utils::Matrix<T>& A) {
+        	factor(A);
+        }
+
+    	void factor(const utils::Matrix<T>&A){
+
+    		const uint64_t rows = A.rows();
+    		if (rows != A.cols()){
+    			throw std::runtime_error("LU: factor non-square");
+    		}
+
+    		if (rows == 0){
+    			LUmat = A;
+    			piv.clear();
+    			singular = false;
+    			return;
+    		}
+
+    		T big, tmp;
+
+    		LUmat = A;
+    		piv.resize(rows);		// piv stores the implicit scaling of each row.
+    		//double d = 1.0;			// No row interchanges yet.
+
+    		for (uint64_t i = 0; i < rows; ++i){	// Loop over rows to get the implicit scaling information.
+    			big = T{0};
+    			for (uint64_t j = 0; j < rows; ++j){
+    				tmp=std::abs(LUmat(i,j));
+    				if (tmp > big){
+    					big = tmp;
+    				}
+    			}
+    			if (big == T{0}){
+    				throw std::runtime_error("Singular matrix in LU.factor()");
+    			}
+    		}
+    	}
+
+	}; // struct LU
+
+	typedef LU<float> LUf;
+	typedef LU<double> LUd;
+
+
+} // namespace decomp
@@ -5,100 +5,39 @@
 #include "./utils/vector.h"
 #include "./utils/matrix.h"

+#include "./numerics/inverse/inverse_gauss_jordan.h"
+#include "./numerics/inverse/inverse_lu.h"
+
+#include <omp.h>
+

 namespace numerics{
 	
 	template <typename T>
   	void inplace_inverse(utils::Matrix<T>& A, std::string method = "Gauss-Jordan"){
+
+		if (A.rows() != A.cols()) {
+		    throw std::runtime_error("inplace_inverse: non-square matrix");
+		}
+
   		if (method == "Gauss-Jordan"){
-
-   			utils::Matrix<T> B(A.rows(),A.cols(), T{0});
-
-
-   			uint64_t icol{0}, irow{0}, rows{A.rows()}, cols{A.cols()};
-   			double big, dum, pivinv, temp;
-   			utils::Vi indxc(rows,0), indxr(rows,0), ipiv(rows,0);
-
-   			//for (uint64_t j = 0; j < N; ++j){ ipiv[j] = 0;}
-				for (uint64_t i = 0; i < rows; i++){
-					big = 0.0;
-					for (uint64_t j = 0; j < rows; j++){
-						if (ipiv[j] != 1){
-							for (uint64_t k = 0; k < rows; k++){
-								if (ipiv[k] == 0){
-									if (abs(A(j,k)) >= big){
-										big = abs(A(j,k));
-										irow = j;
-										icol = k;
-									}
-								}
-							}
-						}
-					}
-					ipiv[icol]++;
-					if (irow != icol){
-						for (uint64_t l = 0; l < rows; l++){ // SWAP 
-							temp = A(irow,l);
-							A(irow,l) = A(icol,l);
-							A(icol,l) = temp;
-						}
-						for (uint64_t l = 0; l < cols; l++){ // SWAP temp matrix
-							temp = B(irow,l);
-							B(irow,l) = B(icol,l);
-							B(icol,l) = temp;
-						}
-					}
-
-					indxr[i] = irow;
-					indxc[i] = icol;
-					if (A(icol,icol) == 0.0){
-						throw std::runtime_error("utill:inplace_inverse('Gauss-Jordan' - Singular Matrix");
-					}
-					pivinv= 1.0/A(icol,icol);
-					A(icol,icol)=1.0;
-				
-					for (uint64_t l = 0; l < rows; l++){
-						A(icol,l) *= pivinv;
-					}
-					for (uint64_t l = 0; l < cols; l++){
-						B(icol,l) *= pivinv;
-					}
-					for (uint64_t ll = 0; ll < rows; ll++){
-						if (ll != icol){
-							dum = A(ll,icol);
-							A(ll,icol) = 0;
-							for (uint64_t l = 0; l < rows; l++){
-								A(ll,l) -= A(icol,l)*dum;
-							}
-							for (uint64_t l = 0; l < rows; l++){
-								B(ll,l) -= B(icol,l)*dum;
-							}
-						}
-					}
-
-				}
-				//m = temp_m;
-				for (int64_t l = rows-1; l >= 0; l--){
-					if (indxr[l] != indxc[l]){
-						for (uint64_t k = 0; k < rows; k++){
-							temp = A(k,indxr[l]);
-						A(k,indxr[l]) = A(k,indxc[l]);
-						A(k,indxc[l]) = temp;
-						}
-					}
-				}
-   		}
+   			inverse_gj(A);
+		}
 		else{
 			throw std::runtime_error("numerics::inplace_inverse(" + method + ") - Not implemented yet \r \nImplemented: 'Gauss-Jordan',");
 		}
-    }
+	}



 	template <typename T>
 	utils::Matrix<T> inverse(utils::Matrix<T>& A, std::string method = "Gauss-Jordan"){
+
+
 		utils::Matrix<T> B = A;
+
 		inplace_inverse(B, method);
+
 		return B;
 	}

@@ -0,0 +1,94 @@
+#ifndef _inverse_gj_n_
+#define _inverse_gj_n_
+
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+#include <omp.h>
+
+
+namespace numerics{
+	
+	template <typename T>
+   	void inverse_gj(utils::Matrix<T>& A){
+		utils::Matrix<T> B(A.rows(),A.cols(), T{0});
+
+
+		uint64_t icol{0}, irow{0}, rows{A.rows()}, cols{A.cols()};
+		double big, dum, pivinv, temp;
+		utils::Vi indxc(rows,0), indxr(rows,0), ipiv(rows,0);
+
+		//for (uint64_t j = 0; j < N; ++j){ ipiv[j] = 0;}
+
+		for (uint64_t i = 0; i < rows; i++){
+			big = 0.0;
+			for (uint64_t j = 0; j < rows; j++){
+				if (ipiv[j] != 1){
+					for (uint64_t k = 0; k < rows; k++){
+						if (ipiv[k] == 0){
+							if (abs(A(j,k)) >= big){
+								big = abs(A(j,k));
+								irow = j;
+								icol = k;
+							}
+						}
+					}
+				}
+			}
+			ipiv[icol]++;
+			if (irow != icol){
+				for (uint64_t l = 0; l < rows; l++){ // SWAP 
+					temp = A(irow,l);
+					A(irow,l) = A(icol,l);
+					A(icol,l) = temp;
+				}
+				for (uint64_t l = 0; l < cols; l++){ // SWAP temp matrix
+					temp = B(irow,l);
+					B(irow,l) = B(icol,l);
+					B(icol,l) = temp;
+				}
+			}
+			indxr[i] = irow;
+			indxc[i] = icol;
+			if (A(icol,icol) == 0.0){
+				throw std::runtime_error("utill:inplace_inverse('Gauss-Jordan' - Singular Matrix");
+			}
+			pivinv= 1.0/A(icol,icol);
+			A(icol,icol)=1.0;
+		
+			for (uint64_t l = 0; l < rows; l++){
+				A(icol,l) *= pivinv;
+			}
+			for (uint64_t l = 0; l < cols; l++){
+				B(icol,l) *= pivinv;
+			}
+			for (uint64_t ll = 0; ll < rows; ll++){
+				if (ll != icol){
+					dum = A(ll,icol);
+					A(ll,icol) = 0;
+					for (uint64_t l = 0; l < rows; l++){
+						A(ll,l) -= A(icol,l)*dum;
+					}
+					for (uint64_t l = 0; l < rows; l++){
+						B(ll,l) -= B(icol,l)*dum;
+					}
+				}
+			}
+
+		}
+		//m = temp_m;
+		for (int64_t l = rows-1; l >= 0; l--){
+			if (indxr[l] != indxc[l]){
+				for (uint64_t k = 0; k < rows; k++){
+					temp = A(k,indxr[l]);
+				A(k,indxr[l]) = A(k,indxc[l]);
+				A(k,indxc[l]) = temp;
+				}
+			}
+		}
+	}
+
+} // namespace numerics
+
+#endif // _inverse_gj_n_
@@ -3,10 +3,12 @@


 #include "./utils/matrix.h"
+#include "./core/omp_config.h"


 namespace numerics{
-	
+
+// ---------------- Serial baseline ----------------
 	template <typename T>
 	utils::Matrix<T> matmul(const utils::Matrix<T>& A, const utils::Matrix<T>& B){

@@ -19,10 +21,8 @@ namespace numerics{
 		const uint64_t p = B.cols();
 		T tmp;

-		utils::Matrix<T> C(m, n, T{0});
+		utils::Matrix<T> C(m, p, T{0});

-		//#pragma omp parallel for collapse(2) schedule(static)
-		#pragma omp parallel for
 		for (uint64_t i = 0; i < m; ++i){
 			for (uint64_t j = 0; j < n; ++j){
 				tmp = A(i,j);
@@ -34,6 +34,85 @@ namespace numerics{
 		return C;
 	}

+// ---------------- Rows-only OpenMP ----------------
+template <typename T>
+utils::Matrix<T> matmul_rows_omp(const utils::Matrix<T>& A,
+                                 const utils::Matrix<T>& B) {
+    if (A.cols() != B.rows()) throw std::runtime_error("matmul_rows_omp: dim mismatch");
+    const uint64_t m=A.rows(), n=A.cols(), p=B.cols();
+
+    utils::Matrix<T> C(m, p, T{0});
+
+    #pragma omp parallel for schedule(static)
+    for (uint64_t i=0;i<m;++i) {
+        for (uint64_t j=0;j<p;++j) {
+            T acc=T{0};
+            for (uint64_t k=0;k<n;++k) {
+            	acc += A(i,k)*B(k,j);
+            }
+            C(i,j)=acc;
+        }
+    }
+    return C;
+}
+
+// -------------- Collapse(2) OpenMP ----------------
+template <typename T>
+utils::Matrix<T> matmul_collapse_omp(const utils::Matrix<T>& A,
+                                     const utils::Matrix<T>& B) {
+    if (A.cols() != B.rows()) throw std::runtime_error("matmul_collapse_omp: dim mismatch");
+    const uint64_t m=A.rows(), n=A.cols(), p=B.cols();
+    utils::Matrix<T> C(m, p, T{0});
+
+    #pragma omp parallel for collapse(2) schedule(static)
+    for (uint64_t i=0;i<m;++i) {
+        for (uint64_t j=0;j<p;++j) {
+            T acc=T{0};
+            for (uint64_t k=0;k<n;++k){
+            	acc += A(i,k)*B(k,j);
+            } 
+            C(i,j)=acc;
+        }
+    }
+    return C;
+}
+
+
+// -------------------- Auto selector ---------------------
+template <typename T>
+utils::Matrix<T> matmul_auto(const utils::Matrix<T>& A,
+                             const utils::Matrix<T>& B) {
+    const uint64_t m=A.rows(), p=B.cols();
+    const uint64_t work = m * p;
+
+    bool can_parallel = omp_config::omp_parallel_allowed();
+
+    #ifdef _OPENMP
+      int threads = omp_get_max_threads();
+    #else
+      int threads = 1;
+    #endif
+
+
+    // Tiny problems: serial is cheapest.
+    if (!can_parallel || work < static_cast<uint64_t>(threads)*4ull) {
+        return matmul(A,B);
+    }
+    // Plenty of (i,j) work → collapse(2) is a great default.
+    else if (work >= 8ull * static_cast<uint64_t>(threads)) {
+        return matmul_collapse_omp(A,B);
+    }
+    // Many rows and very few columns → rows-only cheaper overhead.
+    else if (m >= static_cast<uint64_t>(threads) && p <= 4) {
+        return matmul_rows_omp(A,B);
+    }
+    else{
+	    // Safe fallback
+	    return matmul(A,B);
+    }
+}
+
+



@@ -3,11 +3,13 @@


 #include "./utils/matrix.h"
-
+#include "./core/omp_config.h"

 namespace numerics{
 	
-	// y = A * x, where A is (m×n) and x is length n and y is length m
+// =================================================
+//   y = A * x    (Matrix–Vector product)
+// =================================================
 	template <typename T>
 	utils::Vector<T> matvec(const utils::Matrix<T>& A, const utils::Vector<T>& x) {
 	    if (A.cols() != x.size()) {
@@ -18,6 +20,27 @@ namespace numerics{
 	    const uint64_t n = A.cols();

 	    utils::Vector<T> y(m, T{0});
+
+	    for (uint64_t i = 0; i < m; ++i) {
+	        for (uint64_t j = 0; j < n; ++j) {
+	            y[i] += A(i, j) * x[j];
+	        }
+	    }
+	    return y;
+	}
+	// -------------- Collapse(2) OpenMP ----------------
+	template <typename T>
+	utils::Vector<T> matvec_omp(const utils::Matrix<T>& A, const utils::Vector<T>& x) {
+	    if (A.cols() != x.size()) {
+	        throw std::runtime_error("matvec: dimension mismatch");
+	    }
+
+	    const uint64_t m = A.rows();
+	    const uint64_t n = A.cols();
+
+	    utils::Vector<T> y(m, T{0});
+
+	    #pragma omp parallel for schedule(static)
 	    for (uint64_t i = 0; i < m; ++i) {
 	        T acc = T{0};
 	        for (uint64_t j = 0; j < n; ++j) {
@@ -28,7 +51,34 @@ namespace numerics{
 	    return y;
 	}

-	// y = x * A, where x is length m and A is (m×n) -> y is length n
+	// -------------- Auto OpenMP ----------------
+	template <typename T>
+	utils::Vector<T> matvec_auto(const utils::Matrix<T>& A,
+	                             const utils::Vector<T>& x) {
+
+	    
+	    uint64_t work = A.rows() * A.cols();
+
+	    bool can_parallel = omp_config::omp_parallel_allowed();
+		#ifdef _OPENMP
+		    int threads = omp_get_max_threads();
+		#else
+		    int threads = 1;
+		#endif
+
+	    if (can_parallel || work > static_cast<uint64_t>(threads) * 4ull) {
+	        return matvec_omp(A,x);
+	    }
+	    else{
+	    	// Safe fallback
+	    	return matvec(A,x);
+	    }
+	    
+	}
+
+// =================================================
+//   y = x * A    (Vector–Matrix product)
+// =================================================
 	template <typename T>
 	utils::Vector<T> vecmat(const utils::Vector<T>& x, const utils::Matrix<T>& A) {
 	    if (x.size() != A.rows()) {
@@ -38,6 +88,26 @@ namespace numerics{
 	    const uint64_t n = A.cols();

 	    utils::Vector<T> y(n, T{0});
+
+	    for (uint64_t j = 0; j < n; ++j) {
+	        for (uint64_t i = 0; i < m; ++i) {
+	            y[j] += x[i] * A(i, j);
+	        }
+	    }
+	    return y;
+	}
+
+	// -------------- Collapse(2) OpenMP ----------------
+	template <typename T>
+	utils::Vector<T> vecmat_omp(const utils::Vector<T>& x, const utils::Matrix<T>& A) {
+	    if (x.size() != A.rows()) {
+	        throw std::runtime_error("vecmat: dimension mismatch");
+	    }
+	    const uint64_t m = A.rows();
+	    const uint64_t n = A.cols();
+
+	    utils::Vector<T> y(n, T{0});
+	    #pragma omp parallel for schedule(static)
 	    for (uint64_t j = 0; j < n; ++j) {
 	        T acc = T{0};
 	        for (uint64_t i = 0; i < m; ++i) {
@@ -48,6 +118,30 @@ namespace numerics{
 	    return y;
 	}

+	// -------------- Auto OpenMP ----------------
+	template <typename T>
+	utils::Vector<T> vecmat_auto(const utils::Vector<T>& x,
+								 const utils::Matrix<T>& A) {
+	    
+	    uint64_t work = A.rows() * A.cols();
+
+	    bool can_parallel = omp_config::omp_parallel_allowed();
+		#ifdef _OPENMP
+		    int threads = omp_get_max_threads();
+		#else
+		    int threads = 1;
+		#endif
+
+	    if (can_parallel || work > static_cast<uint64_t>(threads) * 4ull) {
+	        return vecmat_omp(x,A);
+	    }
+	    else{
+	    	// Safe fallback
+	    	return vecmat(x,A);
+	    }
+	    
+	}
+

 } // namespace numerics

@@ -43,28 +43,6 @@ namespace numerics{
 	}


-
-
-
-
-
-
-
-
-
-
-
 } // namespace numerics

-
-
-
-
-
-
-
-
-
-
-
 #endif // _transpose_n_
@@ -1,11 +0,0 @@
-#ifndef _numerics_n_
-#define _numerics_n_
-
-
-namespace utils{
-
-double random(const double& min, const double& max);
-
-}
-
-#endif // _numerics_n_
@@ -66,6 +66,20 @@ public:

 		bool operator!=(const Vector<T>& a) const { return !(*this == a); }

+		//##################################################
+		//#            VECTOR: nearly_equal_vec            #
+		//##################################################
+
+		bool nearly_equal_vec(const Vector<T>& a, double tol=1e-12) const {
+		    if (a.size() != v.size()) return false;
+		    for (uint64_t i=0;i<a.size();++i) {
+		    	if (std::fabs(a[i]-v[i])>tol) {
+		    		return false;
+		    	}
+		    }
+		    return true;
+		}
+
 		//##################################################
 		//#            VECTOR: Scalar Addition             #
 		//##################################################