Finishing up and starting lu decomp

This commit is contained in:
2025-09-13 21:44:20 +02:00
parent 320436ce98
commit 88087ea6a6
24 changed files with 1502 additions and 699 deletions
+42 -23
View File
@@ -4,31 +4,50 @@
#include <omp.h>
// Configure OpenMP behavior at runtime.
inline void omp_configure(int max_active_levels,
bool dynamic_threads,
const std::vector<int>& threads_per_level = {},
bool bind_close = true)
{
// 1) Allow nested parallel regions (levels of teams)
// Example: outer #pragma omp parallel ... { inner #pragma omp parallel ... }
omp_set_max_active_levels(max_active_levels); // 1 = only top-level; 2+ enables nesting
namespace omp_config{
// 2) Let the runtime shrink/grow thread counts if it thinks it should
// (helps avoid oversubscription when you accidentally ask for too many threads)
omp_set_dynamic(dynamic_threads ? 1 : 0);
// Configure OpenMP behavior at runtime.
inline void omp_configure(int max_active_levels,
bool dynamic_threads,
const std::vector<int>& threads_per_level = {},
bool bind_close = true)
{
// 1) Allow nested parallel regions (levels of teams)
// Example: outer #pragma omp parallel ... { inner #pragma omp parallel ... }
omp_set_max_active_levels(max_active_levels); // 1 = only top-level; 2+ enables nesting
// 3) Thread binding (keep threads near their cores) is controlled via env vars,
// so here we just *recommend* a good default (see below). You *can* setenv()
// in code, but its cleaner to do it outside the program.
(void)bind_close; // documented below in env var section
// 2) Let the runtime shrink/grow thread counts if it thinks it should
// (helps avoid oversubscription when you accidentally ask for too many threads)
omp_set_dynamic(dynamic_threads ? 1 : 0);
// 4) Top-level default thread count (inner levels are usually set per region)
if (!threads_per_level.empty()) {
omp_set_num_threads(threads_per_level[0]); // e.g. 16 for the outermost team
// Inner levels:
// - Use num_threads(threads_per_level[L]) on the inner #pragma omp parallel
// - or set OMP_NUM_THREADS="outer,inner,inner2" as an environment variable
// 3) Thread binding (keep threads near their cores) is controlled via env vars,
// so here we just *recommend* a good default (see below). You *can* setenv()
// in code, but its cleaner to do it outside the program.
(void)bind_close; // documented below in env var section
// 4) Top-level default thread count (inner levels are usually set per region)
if (!threads_per_level.empty()) {
omp_set_num_threads(threads_per_level[0]); // e.g. 16 for the outermost team
// Inner levels:
// - Use num_threads(threads_per_level[L]) on the inner #pragma omp parallel
// - or set OMP_NUM_THREADS="outer,inner,inner2" as an environment variable
}
}
}
// ---------- Helper: may we create another team? ----------
inline bool omp_parallel_allowed() {
#ifdef _OPENMP
// If were not in parallel, we can spawn.
if (!omp_in_parallel()) return true;
// Already inside parallel: allow only if nesting is enabled and not at limit.
int level = omp_get_active_level(); // 0 outside parallel, 1 inside, ...
int maxlv = omp_get_max_active_levels(); // user/runtime cap
return static_cast<bool>(level < maxlv);
#else
return false; // no OpenMP → no extra teams
#endif
}
} // namespace omp_config
View File
+4
View File
@@ -0,0 +1,4 @@
#pragma once
#include "./decomp/lu.h"
+61
View File
@@ -0,0 +1,61 @@
#pragma once
#include "./utils/vector.h"
#include "./utils/matrix.h"
namespace decomp{
// Stores PA = LU with partial pivoting (row permutations).
template <typename T>
struct LU{
utils::Matrix<T> LUmat; // combined L (unit diagonal implied) and U
std::vector<uint64_t> piv; // pivot indices (row permutations)
bool singular= false; // set true if zero (or near-zero) pivots encountered
LU() = default;
explicit LU(const utils::Matrix<T>& A) {
factor(A);
}
void factor(const utils::Matrix<T>&A){
const uint64_t rows = A.rows();
if (rows != A.cols()){
throw std::runtime_error("LU: factor non-square");
}
if (rows == 0){
LUmat = A;
piv.clear();
singular = false;
return;
}
T big, tmp;
LUmat = A;
piv.resize(rows); // piv stores the implicit scaling of each row.
//double d = 1.0; // No row interchanges yet.
for (uint64_t i = 0; i < rows; ++i){ // Loop over rows to get the implicit scaling information.
big = T{0};
for (uint64_t j = 0; j < rows; ++j){
tmp=std::abs(LUmat(i,j));
if (tmp > big){
big = tmp;
}
}
if (big == T{0}){
throw std::runtime_error("Singular matrix in LU.factor()");
}
}
}
}; // struct LU
typedef LU<float> LUf;
typedef LU<double> LUd;
} // namespace decomp
+17 -78
View File
@@ -5,100 +5,39 @@
#include "./utils/vector.h"
#include "./utils/matrix.h"
#include "./numerics/inverse/inverse_gauss_jordan.h"
#include "./numerics/inverse/inverse_lu.h"
#include <omp.h>
namespace numerics{
template <typename T>
void inplace_inverse(utils::Matrix<T>& A, std::string method = "Gauss-Jordan"){
if (A.rows() != A.cols()) {
throw std::runtime_error("inplace_inverse: non-square matrix");
}
if (method == "Gauss-Jordan"){
utils::Matrix<T> B(A.rows(),A.cols(), T{0});
uint64_t icol{0}, irow{0}, rows{A.rows()}, cols{A.cols()};
double big, dum, pivinv, temp;
utils::Vi indxc(rows,0), indxr(rows,0), ipiv(rows,0);
//for (uint64_t j = 0; j < N; ++j){ ipiv[j] = 0;}
for (uint64_t i = 0; i < rows; i++){
big = 0.0;
for (uint64_t j = 0; j < rows; j++){
if (ipiv[j] != 1){
for (uint64_t k = 0; k < rows; k++){
if (ipiv[k] == 0){
if (abs(A(j,k)) >= big){
big = abs(A(j,k));
irow = j;
icol = k;
}
}
}
}
}
ipiv[icol]++;
if (irow != icol){
for (uint64_t l = 0; l < rows; l++){ // SWAP
temp = A(irow,l);
A(irow,l) = A(icol,l);
A(icol,l) = temp;
}
for (uint64_t l = 0; l < cols; l++){ // SWAP temp matrix
temp = B(irow,l);
B(irow,l) = B(icol,l);
B(icol,l) = temp;
}
}
indxr[i] = irow;
indxc[i] = icol;
if (A(icol,icol) == 0.0){
throw std::runtime_error("utill:inplace_inverse('Gauss-Jordan' - Singular Matrix");
}
pivinv= 1.0/A(icol,icol);
A(icol,icol)=1.0;
for (uint64_t l = 0; l < rows; l++){
A(icol,l) *= pivinv;
}
for (uint64_t l = 0; l < cols; l++){
B(icol,l) *= pivinv;
}
for (uint64_t ll = 0; ll < rows; ll++){
if (ll != icol){
dum = A(ll,icol);
A(ll,icol) = 0;
for (uint64_t l = 0; l < rows; l++){
A(ll,l) -= A(icol,l)*dum;
}
for (uint64_t l = 0; l < rows; l++){
B(ll,l) -= B(icol,l)*dum;
}
}
}
}
//m = temp_m;
for (int64_t l = rows-1; l >= 0; l--){
if (indxr[l] != indxc[l]){
for (uint64_t k = 0; k < rows; k++){
temp = A(k,indxr[l]);
A(k,indxr[l]) = A(k,indxc[l]);
A(k,indxc[l]) = temp;
}
}
}
}
inverse_gj(A);
}
else{
throw std::runtime_error("numerics::inplace_inverse(" + method + ") - Not implemented yet \r \nImplemented: 'Gauss-Jordan',");
}
}
}
template <typename T>
utils::Matrix<T> inverse(utils::Matrix<T>& A, std::string method = "Gauss-Jordan"){
utils::Matrix<T> B = A;
inplace_inverse(B, method);
return B;
}
@@ -0,0 +1,94 @@
#ifndef _inverse_gj_n_
#define _inverse_gj_n_
#include "./utils/vector.h"
#include "./utils/matrix.h"
#include <omp.h>
namespace numerics{
template <typename T>
void inverse_gj(utils::Matrix<T>& A){
utils::Matrix<T> B(A.rows(),A.cols(), T{0});
uint64_t icol{0}, irow{0}, rows{A.rows()}, cols{A.cols()};
double big, dum, pivinv, temp;
utils::Vi indxc(rows,0), indxr(rows,0), ipiv(rows,0);
//for (uint64_t j = 0; j < N; ++j){ ipiv[j] = 0;}
for (uint64_t i = 0; i < rows; i++){
big = 0.0;
for (uint64_t j = 0; j < rows; j++){
if (ipiv[j] != 1){
for (uint64_t k = 0; k < rows; k++){
if (ipiv[k] == 0){
if (abs(A(j,k)) >= big){
big = abs(A(j,k));
irow = j;
icol = k;
}
}
}
}
}
ipiv[icol]++;
if (irow != icol){
for (uint64_t l = 0; l < rows; l++){ // SWAP
temp = A(irow,l);
A(irow,l) = A(icol,l);
A(icol,l) = temp;
}
for (uint64_t l = 0; l < cols; l++){ // SWAP temp matrix
temp = B(irow,l);
B(irow,l) = B(icol,l);
B(icol,l) = temp;
}
}
indxr[i] = irow;
indxc[i] = icol;
if (A(icol,icol) == 0.0){
throw std::runtime_error("utill:inplace_inverse('Gauss-Jordan' - Singular Matrix");
}
pivinv= 1.0/A(icol,icol);
A(icol,icol)=1.0;
for (uint64_t l = 0; l < rows; l++){
A(icol,l) *= pivinv;
}
for (uint64_t l = 0; l < cols; l++){
B(icol,l) *= pivinv;
}
for (uint64_t ll = 0; ll < rows; ll++){
if (ll != icol){
dum = A(ll,icol);
A(ll,icol) = 0;
for (uint64_t l = 0; l < rows; l++){
A(ll,l) -= A(icol,l)*dum;
}
for (uint64_t l = 0; l < rows; l++){
B(ll,l) -= B(icol,l)*dum;
}
}
}
}
//m = temp_m;
for (int64_t l = rows-1; l >= 0; l--){
if (indxr[l] != indxc[l]){
for (uint64_t k = 0; k < rows; k++){
temp = A(k,indxr[l]);
A(k,indxr[l]) = A(k,indxc[l]);
A(k,indxc[l]) = temp;
}
}
}
}
} // namespace numerics
#endif // _inverse_gj_n_
+83 -4
View File
@@ -3,10 +3,12 @@
#include "./utils/matrix.h"
#include "./core/omp_config.h"
namespace numerics{
// ---------------- Serial baseline ----------------
template <typename T>
utils::Matrix<T> matmul(const utils::Matrix<T>& A, const utils::Matrix<T>& B){
@@ -19,10 +21,8 @@ namespace numerics{
const uint64_t p = B.cols();
T tmp;
utils::Matrix<T> C(m, n, T{0});
utils::Matrix<T> C(m, p, T{0});
//#pragma omp parallel for collapse(2) schedule(static)
#pragma omp parallel for
for (uint64_t i = 0; i < m; ++i){
for (uint64_t j = 0; j < n; ++j){
tmp = A(i,j);
@@ -34,6 +34,85 @@ namespace numerics{
return C;
}
// ---------------- Rows-only OpenMP ----------------
template <typename T>
utils::Matrix<T> matmul_rows_omp(const utils::Matrix<T>& A,
const utils::Matrix<T>& B) {
if (A.cols() != B.rows()) throw std::runtime_error("matmul_rows_omp: dim mismatch");
const uint64_t m=A.rows(), n=A.cols(), p=B.cols();
utils::Matrix<T> C(m, p, T{0});
#pragma omp parallel for schedule(static)
for (uint64_t i=0;i<m;++i) {
for (uint64_t j=0;j<p;++j) {
T acc=T{0};
for (uint64_t k=0;k<n;++k) {
acc += A(i,k)*B(k,j);
}
C(i,j)=acc;
}
}
return C;
}
// -------------- Collapse(2) OpenMP ----------------
template <typename T>
utils::Matrix<T> matmul_collapse_omp(const utils::Matrix<T>& A,
const utils::Matrix<T>& B) {
if (A.cols() != B.rows()) throw std::runtime_error("matmul_collapse_omp: dim mismatch");
const uint64_t m=A.rows(), n=A.cols(), p=B.cols();
utils::Matrix<T> C(m, p, T{0});
#pragma omp parallel for collapse(2) schedule(static)
for (uint64_t i=0;i<m;++i) {
for (uint64_t j=0;j<p;++j) {
T acc=T{0};
for (uint64_t k=0;k<n;++k){
acc += A(i,k)*B(k,j);
}
C(i,j)=acc;
}
}
return C;
}
// -------------------- Auto selector ---------------------
template <typename T>
utils::Matrix<T> matmul_auto(const utils::Matrix<T>& A,
const utils::Matrix<T>& B) {
const uint64_t m=A.rows(), p=B.cols();
const uint64_t work = m * p;
bool can_parallel = omp_config::omp_parallel_allowed();
#ifdef _OPENMP
int threads = omp_get_max_threads();
#else
int threads = 1;
#endif
// Tiny problems: serial is cheapest.
if (!can_parallel || work < static_cast<uint64_t>(threads)*4ull) {
return matmul(A,B);
}
// Plenty of (i,j) work → collapse(2) is a great default.
else if (work >= 8ull * static_cast<uint64_t>(threads)) {
return matmul_collapse_omp(A,B);
}
// Many rows and very few columns → rows-only cheaper overhead.
else if (m >= static_cast<uint64_t>(threads) && p <= 4) {
return matmul_rows_omp(A,B);
}
else{
// Safe fallback
return matmul(A,B);
}
}
+97 -3
View File
@@ -3,11 +3,13 @@
#include "./utils/matrix.h"
#include "./core/omp_config.h"
namespace numerics{
// y = A * x, where A is (m×n) and x is length n and y is length m
// =================================================
// y = A * x (MatrixVector product)
// =================================================
template <typename T>
utils::Vector<T> matvec(const utils::Matrix<T>& A, const utils::Vector<T>& x) {
if (A.cols() != x.size()) {
@@ -18,6 +20,27 @@ namespace numerics{
const uint64_t n = A.cols();
utils::Vector<T> y(m, T{0});
for (uint64_t i = 0; i < m; ++i) {
for (uint64_t j = 0; j < n; ++j) {
y[i] += A(i, j) * x[j];
}
}
return y;
}
// -------------- Collapse(2) OpenMP ----------------
template <typename T>
utils::Vector<T> matvec_omp(const utils::Matrix<T>& A, const utils::Vector<T>& x) {
if (A.cols() != x.size()) {
throw std::runtime_error("matvec: dimension mismatch");
}
const uint64_t m = A.rows();
const uint64_t n = A.cols();
utils::Vector<T> y(m, T{0});
#pragma omp parallel for schedule(static)
for (uint64_t i = 0; i < m; ++i) {
T acc = T{0};
for (uint64_t j = 0; j < n; ++j) {
@@ -28,7 +51,34 @@ namespace numerics{
return y;
}
// y = x * A, where x is length m and A is (m×n) -> y is length n
// -------------- Auto OpenMP ----------------
template <typename T>
utils::Vector<T> matvec_auto(const utils::Matrix<T>& A,
const utils::Vector<T>& x) {
uint64_t work = A.rows() * A.cols();
bool can_parallel = omp_config::omp_parallel_allowed();
#ifdef _OPENMP
int threads = omp_get_max_threads();
#else
int threads = 1;
#endif
if (can_parallel || work > static_cast<uint64_t>(threads) * 4ull) {
return matvec_omp(A,x);
}
else{
// Safe fallback
return matvec(A,x);
}
}
// =================================================
// y = x * A (VectorMatrix product)
// =================================================
template <typename T>
utils::Vector<T> vecmat(const utils::Vector<T>& x, const utils::Matrix<T>& A) {
if (x.size() != A.rows()) {
@@ -38,6 +88,26 @@ namespace numerics{
const uint64_t n = A.cols();
utils::Vector<T> y(n, T{0});
for (uint64_t j = 0; j < n; ++j) {
for (uint64_t i = 0; i < m; ++i) {
y[j] += x[i] * A(i, j);
}
}
return y;
}
// -------------- Collapse(2) OpenMP ----------------
template <typename T>
utils::Vector<T> vecmat_omp(const utils::Vector<T>& x, const utils::Matrix<T>& A) {
if (x.size() != A.rows()) {
throw std::runtime_error("vecmat: dimension mismatch");
}
const uint64_t m = A.rows();
const uint64_t n = A.cols();
utils::Vector<T> y(n, T{0});
#pragma omp parallel for schedule(static)
for (uint64_t j = 0; j < n; ++j) {
T acc = T{0};
for (uint64_t i = 0; i < m; ++i) {
@@ -48,6 +118,30 @@ namespace numerics{
return y;
}
// -------------- Auto OpenMP ----------------
template <typename T>
utils::Vector<T> vecmat_auto(const utils::Vector<T>& x,
const utils::Matrix<T>& A) {
uint64_t work = A.rows() * A.cols();
bool can_parallel = omp_config::omp_parallel_allowed();
#ifdef _OPENMP
int threads = omp_get_max_threads();
#else
int threads = 1;
#endif
if (can_parallel || work > static_cast<uint64_t>(threads) * 4ull) {
return vecmat_omp(x,A);
}
else{
// Safe fallback
return vecmat(x,A);
}
}
} // namespace numerics
-22
View File
@@ -43,28 +43,6 @@ namespace numerics{
}
} // namespace numerics
#endif // _transpose_n_
-11
View File
@@ -1,11 +0,0 @@
#ifndef _numerics_n_
#define _numerics_n_
namespace utils{
double random(const double& min, const double& max);
}
#endif // _numerics_n_
+14
View File
@@ -66,6 +66,20 @@ public:
bool operator!=(const Vector<T>& a) const { return !(*this == a); }
//##################################################
//# VECTOR: nearly_equal_vec #
//##################################################
bool nearly_equal_vec(const Vector<T>& a, double tol=1e-12) const {
if (a.size() != v.size()) return false;
for (uint64_t i=0;i<a.size();++i) {
if (std::fabs(a[i]-v[i])>tol) {
return false;
}
}
return true;
}
//##################################################
//# VECTOR: Scalar Addition #
//##################################################