Regularization

Started on regularization in Loss.h. I need to refactor matsum.h since I need a total sum over the matrix. Also, matmul needs an elementwise matmul function, which is the next thing in the regularization.
2026-01-03 22:10:50 +01:00
parent 32ba0518fa
commit 48f329feef
17 changed files with 881 additions and 510 deletions
@@ -12,45 +12,51 @@ namespace neural_networks{
 	template <typename T>
 	struct Dense_Layer{

-			utils::Matrix<T> _inputs;
-			utils::Matrix<T> weights;
-			utils::Vector<T> biases;
-			utils::Matrix<T> outputs;
+		T weight_regularizer_l1 = {1e-4};
+		T weight_regularizer_l2 = {1e-4};

-			utils::Matrix<T> dweights;
-			utils::Vector<T> dbiases;
-			utils::Matrix<T> dinputs;
+		T bias_regularizer_l1 = {1e-4};
+		T bias_regularizer_l2 = {1e-4};

-			// Variables for optimizers
-		    utils::Matrix<T> weight_momentums;
-		    utils::Vector<T> bias_momentums;
-		    utils::Matrix<T> weight_cache;
-		    utils::Vector<T> bias_cache;
-			
-			// Default Constructor
-			Dense_Layer() = default;
+		utils::Matrix<T> _inputs;
+		utils::Matrix<T> weights;
+		utils::Vector<T> biases;
+		utils::Matrix<T> outputs;

-			// Constructor
-	    	Dense_Layer(const uint64_t n_inputs, const uint64_t n_neurons){
-	        	
-	        	weights.random(n_inputs, n_neurons, -1, 1);
-	        	biases.resize(n_neurons, T{0});
-	        	
-	        }
+		utils::Matrix<T> dweights;
+		utils::Vector<T> dbiases;
+		utils::Matrix<T> dinputs;

-	        void forward(const utils::Matrix<T>& inputs){
-	        	_inputs = inputs;
-			    outputs = numerics::matadd(numerics::matmul_auto(inputs, weights), biases, "row");
-	        }
+		// Variables for optimizers
+	    utils::Matrix<T> weight_momentums;
+	    utils::Vector<T> bias_momentums;
+	    utils::Matrix<T> weight_cache;
+	    utils::Vector<T> bias_cache;
+		
+		// Default Constructor
+		Dense_Layer() = default;

-	        void backward(const utils::Matrix<T>& dvalues){
-	        	// Gradients on parameters
-	        	dweights  = numerics::matmul(numerics::transpose(_inputs), dvalues);
-	        	dbiases = numerics::matsum(dvalues, "row");
-	        	//Gradient on values
-	        	dinputs = numerics::matmul(dvalues, numerics::transpose(weights));
+		// Constructor
+    	Dense_Layer(const uint64_t n_inputs, const uint64_t n_neurons){
+        	
+        	weights.random(n_inputs, n_neurons, -1, 1);
+        	biases.resize(n_neurons, T{0});
+        	
+        }

-	        }
+        void forward(const utils::Matrix<T>& inputs){
+        	_inputs = inputs;
+		    outputs = numerics::matadd(numerics::matmul_auto(inputs, weights), biases, "row");
+        }
+
+        void backward(const utils::Matrix<T>& dvalues){
+        	// Gradients on parameters
+        	dweights  = numerics::matmul(numerics::transpose(_inputs), dvalues);
+        	dbiases = numerics::matsum(dvalues, "row");
+        	//Gradient on values
+        	dinputs = numerics::matmul(dvalues, numerics::transpose(weights));
+
+        }

 	};

@@ -5,30 +5,65 @@
 #include "./utils/vector.h"
 #include "./utils/matrix.h"

-#include "./numerics/vecmean.h"
+#include "numerics/vecmean.h"
+#include "numerics/matabs.h"
+#include "numerics/matmean.h"

 namespace neural_networks{

 	template <typename Td, typename Ti>
 	struct Loss{

-			utils::Vector<Td> sample_losses;
-			utils::Matrix<Td> dinputs;
-			Td data_loss;
+		utils::Vector<Td> sample_losses;
+		utils::Matrix<Td> dinputs;
+		Td data_loss;
+		Td regularization_losss;

-			virtual utils::Vector<Td> forward(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y) = 0;
-			virtual void backward(const utils::Matrix<Td>& dvalues, const utils::Matrix<Ti>& y) = 0;
+		virtual utils::Vector<Td> forward(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y) = 0;
+		virtual void backward(const utils::Matrix<Td>& dvalues, const utils::Matrix<Ti>& y) = 0;

-			Td calculate(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y){
-				
-				// Calculate sample losses
-				sample_losses = forward(output, y);
+		Td calculate(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y){
+			
+			// Calculate sample losses
+			sample_losses = forward(output, y);

-				// Calculate mean loss
-				data_loss = numerics::vecmean(sample_losses);
-				return data_loss;
+			// Calculate mean loss
+			data_loss = numerics::vecmean(sample_losses);

+			return data_loss;
+
+		}
+
+		/// Computes the L1/L2 regularization penalty of a single layer.
+		///
+		/// Reads `weights`, `biases` and the four `*_regularizer_l1/l2` factors
+		/// from `layer`. The result is stored in `regularization_losss` and
+		/// returned. A term is only added when its factor is greater than 0.
+		template <typename Layer>
+		Td regularization_loss(const Layer& layer){
+			// 0 by default
+			regularization_losss = 0;
+
+			// L1 regularization - weights: factor * sum(|w|)
+			// calculate only when factor greater than 0
+			if (layer.weight_regularizer_l1 > 0){
+				regularization_losss += layer.weight_regularizer_l1 * numerics::matsum_coeff(numerics::matabs(layer.weights));
 			}
+
+			// L2 regularization - weights: factor * sum(w * w)
+			// The squares are accumulated element by element. numerics::matmul is
+			// used as a matrix product elsewhere in the project, which would give
+			// a different value and requires compatible (square) shapes.
+			if (layer.weight_regularizer_l2 > 0){
+				Td squared_sum{0};
+				for (uint64_t i = 0; i < layer.weights.rows(); ++i){
+					for (uint64_t j = 0; j < layer.weights.cols(); ++j){
+						squared_sum += layer.weights(i,j) * layer.weights(i,j);
+					}
+				}
+				regularization_losss += layer.weight_regularizer_l2 * squared_sum;
+			}
+
+			// L1 regularization - biases: factor * sum(|b|)
+			// calculate only when factor greater than 0
+			if (layer.bias_regularizer_l1 > 0){
+				regularization_losss += layer.bias_regularizer_l1 * layer.biases.abs().sum();
+			}
+			// L2 regularization - biases: factor * sum(b * b)
+			if (layer.bias_regularizer_l2 > 0){
+				regularization_losss += layer.bias_regularizer_l2 * layer.biases.multiply(layer.biases).sum();
+			}
+
+			return regularization_losss;
+		}
+
+
 	};

 } // end namespace neural_networks
@@ -19,3 +19,5 @@

 #include "optimizers/Optimizer_SGD.h"
 #include "optimizers/Optimizer_Adagrad.h"
+#include "optimizers/Optimizer_RMSprop.h"
+#include "optimizers/Optimizer_Adam.h"
@@ -0,0 +1,134 @@
+#pragma once
+
+#include "./core/omp_config.h"
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+#include "./numerics/matmul.h"
+
+#include <math.h>
+
+
+
+namespace neural_networks{
+
+	/// Adam optimizer.
+	///
+	/// Keeps, per layer, a running average of the gradients (`*_momentums`,
+	/// decay `beta_1`) and of the squared gradients (`*_cache`, decay `beta_2`).
+	/// Both averages are bias-corrected with the current iteration count before
+	/// the parameters are updated. The per-layer state lives in the layer itself;
+	/// the `*_corrected` members below are scratch buffers of this optimizer.
+	///
+	/// Intended call order per training step: `pre_update_params()`,
+	/// `update_params(layer)` for every layer, `post_update_params()`.
+	template <typename T>
+	struct Optimizer_Adam{
+
+			T learning_rate = T{1};
+			T current_learning_rate = learning_rate;
+			T decay = T{0};
+			T epsilon = T{1e-7};
+			T beta_1 = T{0.9};
+			T beta_2 = T{0.999};
+			uint64_t iterations = 0;
+
+			// Bias-corrected averages of the most recently updated layer
+			utils::Matrix<T> weight_momentums_corrected;
+			utils::Vector<T> bias_momentums_corrected;
+			utils::Matrix<T> weight_cache_corrected;
+			utils::Vector<T> bias_cache_corrected;
+		
+			// Default Constructor
+			Optimizer_Adam() = default;
+
+			// Constructor
+	    	explicit Optimizer_Adam(const T lr, const T lr_decay, const T epsilons, const T beta1, const T beta2): 
+	    							learning_rate(lr), 
+	    							current_learning_rate{lr}, 
+	    							decay(lr_decay), 
+	    							epsilon(epsilons),
+	    							beta_1(beta1),
+	    							beta_2(beta2) {}
+
+	    	/// Applies the learning-rate decay for the current iteration.
+	    	/// Does nothing when `decay` is zero.
+	    	void pre_update_params(){
+	    		if(decay){
+	    			current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
+	    		}
+	    	}
+
+	    	/// Performs one Adam step on `layer`.
+	    	///
+	    	/// Reads `layer.dweights` / `layer.dbiases`, updates the layer's
+	    	/// momentum and cache buffers and then `layer.weights` / `layer.biases`.
+	    	template <typename Layer>
+	        void update_params(Layer& layer){
+
+        		const uint64_t rows = layer.weights.rows();
+        		const uint64_t cols = layer.weights.cols();
+        		const uint64_t n_biases = layer.biases.size();
+
+        		// If the layer does not contain correctly shaped momentum AND cache
+        		// arrays, create them filled with zeros. Both are checked: a layer
+        		// whose cache was already sized (e.g. by another optimizer) but whose
+        		// momentums were not would otherwise be indexed out of bounds below.
+        		const bool weight_state_missing =
+        			(layer.weight_cache.rows() != rows) || (layer.weight_cache.cols() != cols) ||
+        			(layer.weight_momentums.rows() != rows) || (layer.weight_momentums.cols() != cols);
+        		if (weight_state_missing){
+        			layer.weight_momentums.resize(rows, cols, T{0});
+        			layer.weight_cache.resize(rows, cols, T{0});
+        		}
+        		const bool bias_state_missing =
+        			(layer.bias_cache.size() != n_biases) || (layer.bias_momentums.size() != n_biases);
+        		if (bias_state_missing){
+        			layer.bias_momentums.resize(n_biases, T{0});
+        			layer.bias_cache.resize(n_biases, T{0});
+        		}
+
+        		// Bias-correction denominators. `iterations` is 0 on the first pass
+        		// and the correction needs to start with 1, hence the +1.
+        		// They do not depend on the element, so they are computed once.
+        		// `auto` keeps the exact type of the original per-element expression.
+        		const auto momentum_correction = T{1} - std::pow(beta_1, iterations+1);
+        		const auto cache_correction = T{1} - std::pow(beta_2, iterations+1);
+
+        		weight_momentums_corrected.resize(rows, cols); // can be optimized out later
+        		weight_cache_corrected.resize(rows, cols);     // can be optimized out later
+        		for (uint64_t i = 0; i < rows; ++i){
+        			for (uint64_t j = 0; j < cols; ++j){
+        				const auto gradient = layer.dweights(i,j);
+
+        				// Update momentum with current gradient
+        				layer.weight_momentums(i,j) = (beta_1 * layer.weight_momentums(i,j)) + ((T{1} - beta_1) * gradient);
+        				// Update cache with squared current gradient
+        				layer.weight_cache(i,j) = (beta_2*layer.weight_cache(i,j)) + ((T{1}-beta_2) * (gradient*gradient));
+
+        				// Bias-corrected momentum and cache
+        				weight_momentums_corrected(i,j) = layer.weight_momentums(i,j) / momentum_correction;
+        				weight_cache_corrected(i,j) = layer.weight_cache(i,j) / cache_correction;
+
+        				// Vanilla SGD parameter update + normalization with square-rooted cache
+        				layer.weights(i,j) -= (current_learning_rate*weight_momentums_corrected(i,j)) / (std::sqrt(weight_cache_corrected(i,j)) + epsilon);
+        			}
+        		}
+
+        		bias_momentums_corrected.resize(n_biases);  // can be optimized out later
+        		bias_cache_corrected.resize(n_biases);      // can be optimized out later
+        		for (uint64_t i = 0; i < n_biases; ++i){
+        			const auto gradient = layer.dbiases[i];
+
+        			layer.bias_momentums[i] = (beta_1 * layer.bias_momentums[i]) + ((T{1} - beta_1) * gradient);
+        			layer.bias_cache[i] = (beta_2*layer.bias_cache[i]) + ((T{1}-beta_2) * (gradient*gradient));
+
+        			bias_momentums_corrected[i] = layer.bias_momentums[i] / momentum_correction;
+        			bias_cache_corrected[i] = layer.bias_cache[i] / cache_correction;
+
+        			layer.biases[i] -= (current_learning_rate*bias_momentums_corrected[i]) / (std::sqrt(bias_cache_corrected[i]) + epsilon);
+        		}
+	        }
+
+	    	/// Advances the iteration counter used by the decay and the bias correction.
+	    	void post_update_params(){
+	    		iterations++;
+	    	}
+
+	};
+
+} // end namespace neural_networks
@@ -0,0 +1,81 @@
+#pragma once
+
+#include "./core/omp_config.h"
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+#include "./numerics/matmul.h"
+
+#include <math.h>
+
+
+
+
+namespace neural_networks{
+
+	/// RMSprop optimizer.
+	///
+	/// Keeps a running average of the squared gradients in the layer's
+	/// `weight_cache` / `bias_cache` (decay factor `rho`) and divides each
+	/// gradient step by the square root of that average plus `epsilon`.
+	template <typename T>
+	struct Optimizer_RMSprop{
+
+			T learning_rate = T{1};
+			T current_learning_rate = learning_rate;
+			T decay = T{0};
+			T epsilon = T{1e-7};
+			T rho = T{0.9};
+			uint64_t iterations = 0;
+		
+			// Default Constructor
+			Optimizer_RMSprop() = default;
+
+			// Constructor
+	    	explicit Optimizer_RMSprop(const T lr, const T lr_decay, const T epsilons, const T rhos)
+	    		: learning_rate(lr),
+	    		  current_learning_rate{lr},
+	    		  decay(lr_decay),
+	    		  epsilon(epsilons),
+	    		  rho(rhos) {}
+
+	    	/// Recomputes the decayed learning rate; a zero `decay` leaves it untouched.
+	    	void pre_update_params(){
+	    		if(!decay){
+	    			return;
+	    		}
+	    		current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
+	    	}
+
+	    	/// Performs one RMSprop step on `layer` using `layer.dweights` / `layer.dbiases`.
+	    	template <typename Layer>
+	        void update_params(Layer& layer){
+
+        		// Create zero-filled cache arrays when the layer has none of matching shape.
+        		const bool weight_cache_fits =
+        			(layer.weight_cache.rows() == layer.weights.rows()) && (layer.weight_cache.cols() == layer.weights.cols());
+        		if (!weight_cache_fits){
+        			layer.weight_cache.resize(layer.weights.rows(), layer.weights.cols(), T{0});
+        		}
+        		const bool bias_cache_fits = (layer.bias_cache.size() == layer.biases.size());
+        		if (!bias_cache_fits){
+        			layer.bias_cache.resize(layer.biases.size(), T{0});
+        		}
+
+        		// Weights: refresh the squared-gradient average of an element, then
+        		// apply the SGD step normalized by the square root of that average.
+        		for (uint64_t r = 0; r < layer.weights.rows(); ++r){
+        			for (uint64_t c = 0; c < layer.weights.cols(); ++c){
+        				layer.weight_cache(r,c) = (rho*layer.weight_cache(r,c)) + ((T{1}-rho) * (layer.dweights(r,c)*layer.dweights(r,c)));
+        				layer.weights(r,c) -= (current_learning_rate*layer.dweights(r,c)) / (std::sqrt(layer.weight_cache(r,c)) + epsilon);
+        			}
+        		}
+
+        		// Biases: same two steps per element.
+        		for (uint64_t k = 0; k < layer.biases.size(); ++k){
+        			layer.bias_cache[k] = (rho*layer.bias_cache[k]) + ((T{1}-rho) * (layer.dbiases[k]*layer.dbiases[k]));
+        			layer.biases[k] -= (current_learning_rate*layer.dbiases[k]) / (std::sqrt(layer.bias_cache[k]) + epsilon);
+        		}
+	        }
+
+	    	/// Advances the iteration counter used by the learning-rate decay.
+	    	void post_update_params(){
+	    		++iterations;
+	    	}
+
+	};
+
+} // end namespace neural_networks
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "./numerics/abs.h"
+#include "./utils/matrix.h"
+
+namespace numerics{
+	
+	// Overwrites every element of A with numerics::abs of that element.
+	template <typename T>
+	void inplace_matabs(utils::Matrix<T>& A){
+
+		const uint64_t n_rows = A.rows();
+		const uint64_t n_cols = A.cols();
+
+		for (uint64_t r = 0; r < n_rows; ++r){
+			for (uint64_t c = 0; c < n_cols; ++c){
+				A(r,c) = numerics::abs(A(r,c));
+			}
+		}
+	}
+
+
+	// Returns a copy of A to which inplace_matabs has been applied; A itself is not modified.
+	template <typename T>
+	utils::Matrix<T> matabs(const utils::Matrix<T>& A){
+		utils::Matrix<T> magnitudes(A);
+		inplace_matabs(magnitudes);
+		return magnitudes;
+	}
+
+
+
+} // namespace numerics
+
@@ -1,5 +1,4 @@
-#ifndef _mean_n_
-#define _mean_n_
+#pragma once

 #include "./utils/vector.h"
 #include "./utils/matrix.h"
@@ -8,7 +7,7 @@
 namespace numerics{
 	
 	template <typename T>
-	T matmean(utils::Matrix<T>& A) {
+	T matmean(const utils::Matrix<T>& A) {

 		T mean(T{0});
 	    
@@ -27,7 +26,7 @@ namespace numerics{


 	template <typename T>
-	void inplace_matmean_row(utils::Matrix<T>& A, utils::Vector<T>& b) {
+	void inplace_matmean_row(const utils::Matrix<T>& A, utils::Vector<T>& b) {
    
 	    const uint64_t rows = A.rows();
 	    const uint64_t cols = A.cols();
@@ -40,12 +39,12 @@ namespace numerics{
    		for (uint64_t i = 0; i < rows; ++i){
    			b[j] += A(i, j);
    		}
-    		b[j] =/ static_cast<T>(rows);
+    		b[j] /= static_cast<T>(rows);
    	}
 	}

 	template <typename T>
-	void inplace_matmean_cols(utils::Matrix<T>& A) {
+	void inplace_matmean_cols(const utils::Matrix<T>& A, utils::Vector<T>& b) {

 	    const uint64_t rows = A.rows();
 	    const uint64_t cols = A.cols();
@@ -58,15 +57,15 @@ namespace numerics{
    		for (uint64_t j = 0; j < cols; ++j){
    			b[i] += A(i, j);
    		}
-    		b[j] =/ static_cast<T>(cols);
+    		b[i] /= static_cast<T>(cols);
    	}
 	}


 	template <typename T>
-	utils::Vector<T> matmean_row(utils::Matrix<T>& A) {
+	utils::Vector<T> matmean_row(const utils::Matrix<T>& A) {
    
-	    utils:Vector<T> b(A.rows(), T{0});
+	    utils::Vector<T> b(A.rows(), T{0});

 	    inplace_matmean_row(A, b);
 	    
@@ -74,9 +73,9 @@ namespace numerics{
 	}

 	template <typename T>
-	utils::Vector<T> matmean_col(utils::Matrix<T>& A) {
+	utils::Vector<T> matmean_col(const utils::Matrix<T>& A) {

-	    utils:Vector<T> b(A.cols(), T{0});
+	    utils::Vector<T> b(A.cols(), T{0});

 	    inplace_matmean_cols(A, b);
 	    
@@ -84,5 +83,3 @@ namespace numerics{
 	}

 } // namespace numerics
-
-#endif // _mean_n_
@@ -7,6 +7,20 @@

 namespace numerics{
 	
+	// Returns the sum of all coefficients of A (T{0} for an empty matrix).
+	template <typename T>
+	T matsum_coeff(const utils::Matrix<T>& A) {
+
+		// The accumulator must start at zero; a default-initialized arithmetic T
+		// would hold an indeterminate value.
+		T b{0};
+
+		// A(i, j) is indexed as (row, column), so i runs over rows and j over columns.
+		for (uint64_t i = 0; i < A.rows(); ++i){
+			for (uint64_t j = 0; j < A.cols(); ++j){
+				b += A(i, j); 
+			}
+		}
+		return b;
+	}
+
+
 	template <typename T>
 	utils::Vector<T> matsum(const utils::Matrix<T>& A, std::string method) {

@@ -16,6 +16,7 @@
 #include "./numerics/matmul.h"
 #include "./numerics/matscalar.h"
 #include "./numerics/matmax.h"
+#include "./numerics/matabs.h"
 #include "./numerics/matdiv.h"
 #include "./numerics/matvec.h"
 #include "./numerics/matadd.h"
@@ -400,12 +400,29 @@ public:
 			}
 			return result;
 		}
+
+		//############################################
+		//#            VECTOR: Abs                   #
+		//############################################
+		// Returns a copy of this vector in which every element that compares
+		// less than T{0} is replaced by its negation; other elements are kept.
+		Vector<T> abs()const{
+			Vector<T> magnitudes = *this;
+			const uint64_t count = v.size();
+			for (uint64_t idx = 0; idx < count; ++idx){
+				if (!(v[idx]<T{0})){
+					continue;
+				}
+				magnitudes[idx] = -v[idx];
+			}
+			return magnitudes;
+		}
+
+
 		//############################################
 		//#            VECTOR: Norm                  #
 		//############################################
 		T norm() const{
 			return static_cast<T>(std::sqrt(this->dot(*this)));
 		}
+
 		//############################################
 		//#            VECTOR: Normalize             #
 		//############################################