Optimizers - Part 1

Done with SGD and Adagrad; they still need to be optimized, but they work.
2026-01-01 19:23:48 +01:00
parent bd2edea8ef
commit e5f8c91be4
12 changed files with 299 additions and 133 deletions
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "./core/omp_config.h"
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+#include "./numerics/matmul.h"
+
+#include <math.h>
+
+
+
+
+namespace neural_networks{
+
+	// Adagrad optimizer.
+	//
+	// Keeps, per layer, a running sum of squared gradients (the "cache") and
+	// scales every parameter step by 1 / (sqrt(cache) + epsilon).
+	//
+	// Expected call order per training step:
+	//   pre_update_params();  update_params(layer) for each layer;  post_update_params();
+	//
+	// T is the scalar type used for the hyper-parameters and the arithmetic.
+	template <typename T>
+	struct Optimizer_Adagrad{
+
+			T learning_rate = T{1};                  // initial learning rate
+			T current_learning_rate = learning_rate; // learning rate after decay, used by update_params
+			T decay = T{0};                          // learning-rate decay factor; 0 disables decay
+			T epsilon = T{1e-7};                     // added to the denominator of the update
+			uint64_t iterations = 0;                 // number of completed steps (see post_update_params)
+
+			// Default Constructor
+			Optimizer_Adagrad() = default;
+
+			// Constructor
+			//   lr       - initial learning rate
+			//   lr_decay - learning-rate decay factor (0 disables decay)
+			//   epsilons - value added to sqrt(cache) in the denominator
+			explicit Optimizer_Adagrad(const T lr, const T lr_decay, const T epsilons): learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), epsilon(epsilons) {}
+
+			// Recomputes current_learning_rate = learning_rate / (1 + decay * iterations).
+			// Does nothing when decay is zero.
+			void pre_update_params(){
+				if(decay){
+					current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*static_cast<T>(iterations))));
+				}
+			}
+
+			// Applies one Adagrad step to `layer`.
+			//
+			// Layer must provide:
+			//   weights, dweights, weight_cache : rows(), cols(), operator()(i,j);
+			//                                     weight_cache also resize(rows, cols, fill)
+			//   biases, dbiases, bias_cache     : size(), operator[](i);
+			//                                     bias_cache also resize(size, fill)
+			template <typename Layer>
+			void update_params(Layer& layer){
+
+				const uint64_t rows = layer.weights.rows();
+				const uint64_t cols = layer.weights.cols();
+				const uint64_t bias_count = layer.biases.size();
+
+				// If the layer does not contain matching cache arrays yet, create them filled with zeros.
+				if ((layer.weight_cache.rows() != rows) || (layer.weight_cache.cols() != cols)){
+					layer.weight_cache.resize(rows, cols, T{0});
+				}
+				if (layer.bias_cache.size() != bias_count){
+					layer.bias_cache.resize(bias_count, T{0});
+				}
+
+				// Weights: accumulate the squared gradient into the cache, then take
+				// the SGD step normalized by the square root of the accumulated cache.
+				// The cache must be accumulated (+=), not overwritten: the growing sum
+				// is what shrinks the step size of frequently-updated parameters.
+				for (uint64_t i = 0; i < rows; ++i){
+					for (uint64_t j = 0; j < cols; ++j){
+						const T grad = layer.dweights(i,j);
+						layer.weight_cache(i,j) += grad*grad;
+						layer.weights(i,j) -= (current_learning_rate*grad) / (std::sqrt(layer.weight_cache(i,j)) + epsilon);
+					}
+				}
+
+				// Biases: same rule as for the weights.
+				for (uint64_t i = 0; i < bias_count; ++i){
+					const T grad = layer.dbiases[i];
+					layer.bias_cache[i] += grad*grad;
+					layer.biases[i] -= (current_learning_rate*grad) / (std::sqrt(layer.bias_cache[i]) + epsilon);
+				}
+			}
+
+			// Advances the step counter used by the learning-rate decay.
+			void post_update_params(){
+				iterations++;
+			}
+
+	};
+
+} // end namespace neural_networks
@@ -0,0 +1,97 @@
+#pragma once
+
+#include "./core/omp_config.h"
+
+#include "./utils/vector.h"
+#include "./utils/matrix.h"
+
+#include "./numerics/matmul.h"
+
+
+
+
+namespace neural_networks{
+
+	// Stochastic gradient descent optimizer with optional momentum and
+	// learning-rate decay.
+	//
+	// Expected call order per training step:
+	//   pre_update_params();  update_params(layer) for each layer;  post_update_params();
+	//
+	// T is the scalar type used for the hyper-parameters and the arithmetic.
+	template <typename T>
+	struct Optimizer_SGD{
+
+			T learning_rate = T{1};                  // initial learning rate
+			T current_learning_rate = learning_rate; // learning rate after decay, used by update_params
+			T decay = T{0};                          // learning-rate decay factor; 0 disables decay
+			T momentum = T{0};                       // fraction of the previous update to retain; 0 disables momentum
+			uint64_t iterations = 0;                 // number of completed steps (see post_update_params)
+
+			// Scratch buffers holding the update computed for the layer currently
+			// being processed. They are fully overwritten on every update_params call.
+			utils::Matrix<T> weight_updates;
+			utils::Vector<T> bias_updates;
+
+			// Default Constructor
+			Optimizer_SGD() = default;
+
+			// Constructor
+			//   lr        - initial learning rate
+			//   lr_decay  - learning-rate decay factor (0 disables decay)
+			//   momentums - momentum factor (0 disables momentum)
+			explicit Optimizer_SGD(const T lr, const T lr_decay, const T momentums): learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), momentum(momentums) {}
+
+			// Recomputes current_learning_rate = learning_rate / (1 + decay * iterations).
+			// Does nothing when decay is zero.
+			void pre_update_params(){
+				if(decay){
+					current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*static_cast<T>(iterations))));
+				}
+			}
+
+			// Applies one SGD step to `layer`, updating both its weights and its biases.
+			//
+			// Layer must provide:
+			//   weights, dweights : rows(), cols(), operator()(i,j)
+			//   biases, dbiases   : size(), operator[](i)
+			// and, when momentum is non-zero:
+			//   weight_momentums  : rows(), cols(), operator()(i,j), resize(rows, cols, fill),
+			//                       assignable from utils::Matrix<T>
+			//   bias_momentums    : size(), operator[](i), resize(size, fill),
+			//                       assignable from utils::Vector<T>
+			template <typename Layer>
+			void update_params(Layer& layer){
+
+				const uint64_t rows = layer.weights.rows();
+				const uint64_t cols = layer.weights.cols();
+				const uint64_t bias_count = layer.biases.size();
+
+				// Shape the scratch buffers for this layer; every element is assigned below.
+				weight_updates.resize(rows, cols); // can be optimized out later
+				bias_updates.resize(bias_count);   // can be optimized out later
+
+				// if we use momentum
+				if(momentum){
+					// If the layer does not contain matching momentum arrays yet, create them filled with zeros.
+					if ((layer.weight_momentums.rows() != rows) || (layer.weight_momentums.cols() != cols)){
+						layer.weight_momentums.resize(rows, cols, T{0});
+					}
+					if (layer.bias_momentums.size() != bias_count){
+						layer.bias_momentums.resize(bias_count, T{0});
+					}
+
+					// Build weight updates with momentum - take previous updates,
+					// multiplied by retain factor and update with current gradients.
+					for (uint64_t i = 0; i < rows; ++i){
+						for (uint64_t j = 0; j < cols; ++j){
+							weight_updates(i,j) = (momentum*layer.weight_momentums(i,j)) - (current_learning_rate*layer.dweights(i,j));
+						}
+					}
+					layer.weight_momentums = weight_updates;
+
+					// Build bias updates the same way.
+					for (uint64_t i = 0; i < bias_count; ++i){
+						bias_updates[i] = (momentum*layer.bias_momentums[i]) - (current_learning_rate*layer.dbiases[i]);
+					}
+					layer.bias_momentums = bias_updates;
+				}else{
+					// Vanilla SGD: update = -lr * gradient.
+					// Assign (not -=): the scratch buffers persist between calls and are
+					// shared by all layers, so they may still hold a previous update.
+					for (uint64_t i = 0; i < rows; ++i){
+						for (uint64_t j = 0; j < cols; ++j){
+							weight_updates(i,j) = -(current_learning_rate*layer.dweights(i,j));
+						}
+					}
+					for (uint64_t i = 0; i < bias_count; ++i){
+						bias_updates[i] = -(current_learning_rate*layer.dbiases[i]);
+					}
+				}
+
+				// Apply the updates to the weights ...
+				for (uint64_t i = 0; i < rows; ++i){
+					for (uint64_t j = 0; j < cols; ++j){
+						layer.weights(i,j) += weight_updates(i,j);
+					}
+				}
+				// ... and to the biases.
+				for (uint64_t i = 0; i < bias_count; ++i){
+					layer.biases[i] += bias_updates[i];
+				}
+			}
+
+			// Advances the step counter used by the learning-rate decay.
+			void post_update_params(){
+				iterations++;
+			}
+
+	};
+
+} // end namespace neural_networks