Finishing up and starting lu decomp

This commit is contained in:
2025-09-13 21:44:20 +02:00
parent 320436ce98
commit 88087ea6a6
24 changed files with 1502 additions and 699 deletions
+83 -4
View File
@@ -3,10 +3,12 @@
#include "./utils/matrix.h"
#include "./core/omp_config.h"
namespace numerics{
// ---------------- Serial baseline ----------------
template <typename T>
utils::Matrix<T> matmul(const utils::Matrix<T>& A, const utils::Matrix<T>& B){
@@ -19,10 +21,8 @@ namespace numerics{
const uint64_t p = B.cols();
T tmp;
utils::Matrix<T> C(m, n, T{0});
utils::Matrix<T> C(m, p, T{0});
//#pragma omp parallel for collapse(2) schedule(static)
#pragma omp parallel for
for (uint64_t i = 0; i < m; ++i){
for (uint64_t j = 0; j < n; ++j){
tmp = A(i,j);
@@ -34,6 +34,85 @@ namespace numerics{
return C;
}
// ---------------- Rows-only OpenMP ----------------
// Matrix product C = A * B with the outermost (row) loop shared among
// OpenMP threads using a static schedule.
//
// A : left operand; its column count must equal B's row count.
// B : right operand.
// Returns a matrix with A.rows() rows and B.cols() columns.
// Throws std::runtime_error when A.cols() != B.rows(); the check runs
// before the parallel loop is entered.
template <typename T>
utils::Matrix<T> matmul_rows_omp(const utils::Matrix<T>& A,
                                 const utils::Matrix<T>& B) {
    if (A.cols() != B.rows()) {
        throw std::runtime_error("matmul_rows_omp: dim mismatch");
    }

    const uint64_t out_rows = A.rows();
    const uint64_t inner    = A.cols();
    const uint64_t out_cols = B.cols();

    utils::Matrix<T> C(out_rows, out_cols, T{0});

    // Only the row loop is distributed; each iteration owns one row of C.
    #pragma omp parallel for schedule(static)
    for (uint64_t row = 0; row < out_rows; ++row) {
        for (uint64_t col = 0; col < out_cols; ++col) {
            // The accumulator is declared inside the loop body, so every
            // (row, col) pair works on its own private running sum.
            T dot = T{0};
            for (uint64_t t = 0; t < inner; ++t) {
                dot += A(row, t) * B(t, col);
            }
            C(row, col) = dot;
        }
    }

    return C;
}
// -------------- Collapse(2) OpenMP ----------------
// Matrix product C = A * B where the two outer loops (row and column of C)
// are merged into one iteration space via collapse(2) and divided among
// OpenMP threads with a static schedule.
//
// A : left operand; its column count must equal B's row count.
// B : right operand.
// Returns a matrix with A.rows() rows and B.cols() columns.
// Throws std::runtime_error when A.cols() != B.rows(); the check runs
// before the parallel loop is entered.
template <typename T>
utils::Matrix<T> matmul_collapse_omp(const utils::Matrix<T>& A,
                                     const utils::Matrix<T>& B) {
    if (A.cols() != B.rows()) {
        throw std::runtime_error("matmul_collapse_omp: dim mismatch");
    }

    const uint64_t out_rows = A.rows();
    const uint64_t inner    = A.cols();
    const uint64_t out_cols = B.cols();

    utils::Matrix<T> C(out_rows, out_cols, T{0});

    // collapse(2) needs the two loops below to stay perfectly nested:
    // no statement may sit between the two `for` headers.
    #pragma omp parallel for collapse(2) schedule(static)
    for (uint64_t row = 0; row < out_rows; ++row) {
        for (uint64_t col = 0; col < out_cols; ++col) {
            // Private running sum for this single output element.
            T dot = T{0};
            for (uint64_t t = 0; t < inner; ++t) {
                dot += A(row, t) * B(t, col);
            }
            C(row, col) = dot;
        }
    }

    return C;
}
// -------------------- Auto selector ---------------------
// Chooses one of the matmul kernels from the size of the output matrix
// and the available thread count, then forwards A and B to it.
//
// Selection, with cells = A.rows() * B.cols() and T = thread count:
//   1. parallelism disallowed, or cells < 4*T  -> matmul
//   2. cells >= 8*T                            -> matmul_collapse_omp
//   3. A.rows() >= T and B.cols() <= 4         -> matmul_rows_omp
//   4. anything else                           -> matmul
//
// The thread count comes from omp_get_max_threads() when built with
// OpenMP and is 1 otherwise. No dimension check is done here; that is
// left to the kernel that gets called.
template <typename T>
utils::Matrix<T> matmul_auto(const utils::Matrix<T>& A,
                             const utils::Matrix<T>& B) {
    const uint64_t out_rows = A.rows();
    const uint64_t out_cols = B.cols();
    const uint64_t cells    = out_rows * out_cols;

    const bool parallel_ok = omp_config::omp_parallel_allowed();

#ifdef _OPENMP
    const int max_threads = omp_get_max_threads();
#else
    const int max_threads = 1;
#endif
    const uint64_t team = static_cast<uint64_t>(max_threads);

    // Too little output to be worth splitting up (or parallelism is off).
    if (!parallel_ok || cells < team * 4ull) {
        return matmul(A, B);
    }

    // Enough (row, column) pairs per thread: split the merged i/j space.
    if (cells >= 8ull * team) {
        return matmul_collapse_omp(A, B);
    }

    // In-between size with at least one row per thread and a narrow
    // result: splitting by rows alone is used.
    if (out_rows >= team && out_cols <= 4) {
        return matmul_rows_omp(A, B);
    }

    // Nothing matched: fall back to the baseline kernel.
    return matmul(A, B);
}