Optimizers - Part 1
Sync public mirror / sync (push) Failing after 24s

Done with SGD and Adagrad, still need to optimize them but they work.
This commit is contained in:
2026-01-01 19:23:48 +01:00
parent bd2edea8ef
commit e5f8c91be4
12 changed files with 299 additions and 133 deletions
-1
View File
@@ -1 +0,0 @@
empty
-1
View File
@@ -1 +0,0 @@
2
Binary file not shown.
@@ -61,6 +61,8 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o
/home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h /home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h /home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/neural_networks.h /home/newton/Documents/Git/Flux/include/modules/neural_networks/neural_networks.h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_Adagrad.h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_SGD.h
/home/newton/Documents/Git/Flux/include/numerics/numerics.h /home/newton/Documents/Git/Flux/include/numerics/numerics.h
/home/newton/Documents/Git/Flux/include/utils/matrix.h /home/newton/Documents/Git/Flux/include/utils/matrix.h
/home/newton/Documents/Git/Flux/include/utils/utils.h /home/newton/Documents/Git/Flux/include/utils/utils.h
@@ -160,6 +162,7 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o
/usr/include/c++/13/istream /usr/include/c++/13/istream
/usr/include/c++/13/limits /usr/include/c++/13/limits
/usr/include/c++/13/locale /usr/include/c++/13/locale
/usr/include/c++/13/math.h
/usr/include/c++/13/new /usr/include/c++/13/new
/usr/include/c++/13/numeric /usr/include/c++/13/numeric
/usr/include/c++/13/ostream /usr/include/c++/13/ostream
@@ -60,6 +60,8 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h \ /home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h \ /home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/neural_networks.h \ /home/newton/Documents/Git/Flux/include/modules/neural_networks/neural_networks.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_Adagrad.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_SGD.h \
/home/newton/Documents/Git/Flux/include/numerics/numerics.h \ /home/newton/Documents/Git/Flux/include/numerics/numerics.h \
/home/newton/Documents/Git/Flux/include/utils/matrix.h \ /home/newton/Documents/Git/Flux/include/utils/matrix.h \
/home/newton/Documents/Git/Flux/include/utils/utils.h \ /home/newton/Documents/Git/Flux/include/utils/utils.h \
@@ -159,6 +161,7 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/c++/13/istream \ /usr/include/c++/13/istream \
/usr/include/c++/13/limits \ /usr/include/c++/13/limits \
/usr/include/c++/13/locale \ /usr/include/c++/13/locale \
/usr/include/c++/13/math.h \
/usr/include/c++/13/new \ /usr/include/c++/13/new \
/usr/include/c++/13/numeric \ /usr/include/c++/13/numeric \
/usr/include/c++/13/ostream \ /usr/include/c++/13/ostream \
@@ -322,12 +325,22 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/x86_64-linux-gnu/c++/13/bits/c++allocator.h: /usr/include/x86_64-linux-gnu/c++/13/bits/c++allocator.h:
/usr/include/x86_64-linux-gnu/c++/13/bits/atomic_word.h:
/usr/include/x86_64-linux-gnu/bits/wordsize.h:
/usr/include/x86_64-linux-gnu/bits/wctype-wchar.h:
/usr/include/x86_64-linux-gnu/bits/wchar.h:
/usr/include/c++/13/bits/functexcept.h: /usr/include/c++/13/bits/functexcept.h:
/usr/include/x86_64-linux-gnu/bits/libc-header-start.h: /usr/include/x86_64-linux-gnu/bits/libc-header-start.h:
/usr/include/c++/13/bits/stl_construct.h: /usr/include/c++/13/bits/stl_construct.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_SGD.h:
/home/newton/Documents/Git/Flux/include/decomp/decomp.h: /home/newton/Documents/Git/Flux/include/decomp/decomp.h:
/usr/include/c++/13/bits/stl_bvector.h: /usr/include/c++/13/bits/stl_bvector.h:
@@ -384,10 +397,6 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/c++/13/bits/locale_facets.h: /usr/include/c++/13/bits/locale_facets.h:
/usr/include/c++/13/bits/functional_hash.h:
/usr/include/errno.h:
/usr/include/c++/13/bits/exception_ptr.h: /usr/include/c++/13/bits/exception_ptr.h:
/usr/include/c++/13/bits/locale_conv.h: /usr/include/c++/13/bits/locale_conv.h:
@@ -420,8 +429,6 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/c++/13/bits/basic_string.tcc: /usr/include/c++/13/bits/basic_string.tcc:
/usr/include/c++/13/bits/stringfwd.h:
/usr/include/c++/13/backward/binders.h: /usr/include/c++/13/backward/binders.h:
/usr/include/alloca.h: /usr/include/alloca.h:
@@ -432,6 +439,10 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/wchar.h: /usr/include/wchar.h:
/usr/include/c++/13/bits/functional_hash.h:
/usr/include/errno.h:
/usr/include/c++/13/bits/ios_base.h: /usr/include/c++/13/bits/ios_base.h:
/home/newton/Documents/Git/Flux/include/numerics/veclog.h: /home/newton/Documents/Git/Flux/include/numerics/veclog.h:
@@ -446,6 +457,8 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/x86_64-linux-gnu/bits/uintn-identity.h: /usr/include/x86_64-linux-gnu/bits/uintn-identity.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_Adagrad.h:
/usr/include/c++/13/bits/cxxabi_init_exception.h: /usr/include/c++/13/bits/cxxabi_init_exception.h:
/usr/include/c++/13/typeinfo: /usr/include/c++/13/typeinfo:
@@ -486,10 +499,6 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/x86_64-linux-gnu/bits/struct_rwlock.h: /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h:
/usr/include/c++/13/bits/uniform_int_dist.h:
/usr/include/c++/13/bits/locale_classes.tcc:
/home/newton/Documents/Git/Flux/include/numerics/inverse.h: /home/newton/Documents/Git/Flux/include/numerics/inverse.h:
/usr/include/c++/13/bits/stl_numeric.h: /usr/include/c++/13/bits/stl_numeric.h:
@@ -502,10 +511,6 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/home/newton/Documents/Git/Flux/include/utils/random.h: /home/newton/Documents/Git/Flux/include/utils/random.h:
/usr/include/x86_64-linux-gnu/c++/13/bits/atomic_word.h:
/usr/include/x86_64-linux-gnu/bits/wordsize.h:
/home/newton/Documents/Git/Flux/include/numerics/matsubtract.h: /home/newton/Documents/Git/Flux/include/numerics/matsubtract.h:
/home/newton/Documents/Git/Flux/include/numerics/matdot.h: /home/newton/Documents/Git/Flux/include/numerics/matdot.h:
@@ -544,6 +549,8 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/c++/13/bits/ostream_insert.h: /usr/include/c++/13/bits/ostream_insert.h:
/usr/include/c++/13/math.h:
/home/newton/Documents/Git/Flux/include/numerics/max.h: /home/newton/Documents/Git/Flux/include/numerics/max.h:
/usr/include/x86_64-linux-gnu/bits/pthread_stack_min-dynamic.h: /usr/include/x86_64-linux-gnu/bits/pthread_stack_min-dynamic.h:
@@ -560,16 +567,6 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/c++/13/bits/codecvt.h: /usr/include/c++/13/bits/codecvt.h:
/usr/include/x86_64-linux-gnu/bits/flt-eval-method.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h:
/usr/include/c++/13/bits/streambuf_iterator.h:
/usr/include/x86_64-linux-gnu/c++/13/bits/opt_random.h:
/usr/include/c++/13/ostream:
/usr/include/c++/13/bits/cpp_type_traits.h: /usr/include/c++/13/bits/cpp_type_traits.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h: /home/newton/Documents/Git/Flux/include/modules/neural_networks/layers/Dense_Layer.h:
@@ -630,6 +627,22 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h: /home/newton/Documents/Git/Flux/include/modules/neural_networks/datasets/vertical.h:
/usr/include/x86_64-linux-gnu/bits/flt-eval-method.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h:
/usr/include/c++/13/bits/streambuf_iterator.h:
/usr/include/x86_64-linux-gnu/c++/13/bits/opt_random.h:
/usr/include/c++/13/ostream:
/usr/include/c++/13/bits/stringfwd.h:
/usr/include/c++/13/bits/locale_classes.tcc:
/usr/include/c++/13/bits/uniform_int_dist.h:
/usr/include/c++/13/bits/vector.tcc: /usr/include/c++/13/bits/vector.tcc:
/usr/include/c++/13/cctype: /usr/include/c++/13/cctype:
@@ -816,12 +829,8 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: /h
/usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h: /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h:
/usr/include/x86_64-linux-gnu/bits/wchar.h:
/usr/include/x86_64-linux-gnu/bits/types/locale_t.h: /usr/include/x86_64-linux-gnu/bits/types/locale_t.h:
/usr/include/x86_64-linux-gnu/bits/wctype-wchar.h:
/home/newton/Documents/Git/Flux/include/modules/neural_networks/loss/Loss_CategoricalCrossentrophy.h: /home/newton/Documents/Git/Flux/include/modules/neural_networks/loss/Loss_CategoricalCrossentrophy.h:
/usr/include/x86_64-linux-gnu/bits/types/clockid_t.h: /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h:
@@ -238,4 +238,7 @@ examples/dense-neural-network/CMakeFiles/dense-neural-network.dir/main.cpp.o: \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h \ /home/newton/Documents/Git/Flux/include/modules/neural_networks/activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h \
/home/newton/Documents/Git/Flux/include/./modules/neural_networks/loss/Loss_CategoricalCrossentrophy.h \ /home/newton/Documents/Git/Flux/include/./modules/neural_networks/loss/Loss_CategoricalCrossentrophy.h \
/home/newton/Documents/Git/Flux/include/./modules/neural_networks/loss/./Loss.h \ /home/newton/Documents/Git/Flux/include/./modules/neural_networks/loss/./Loss.h \
/home/newton/Documents/Git/Flux/include/./numerics/vecmean.h /home/newton/Documents/Git/Flux/include/./numerics/vecmean.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_SGD.h \
/home/newton/Documents/Git/Flux/include/modules/neural_networks/optimizers/Optimizer_Adagrad.h \
/usr/include/c++/13/math.h
+67 -102
View File
@@ -19,131 +19,96 @@
int main(int argc, char const *argv[]) int main(int argc, char const *argv[])
{ {
utils::Mf X(10,2, 0); uint64_t number_of_classes = 5;
utils::Matrix<int64_t> y(10,1, 0); uint64_t number_of_samples = 100;
uint64_t number_of_epochs = 100;
utils::Mf X;
utils::Matrix<int64_t> y;
utils::Vector<int64_t> class_targets; utils::Vector<int64_t> class_targets;
float loss; float loss;
float accuracy; float accuracy;
//neural_networks::create_spital_data<float, uint64_t>(10000, 3, X, y);
neural_networks::create_vertical_data<float, int64_t>(100, 3, X, y);
neural_networks::Dense_Layer<float> dense1(2, 3);
neural_networks::Activation_ReLU<float> activation1;
neural_networks::Dense_Layer<float> dense2(3, 3);
neural_networks::Activation_Softmax<float> activation2;
neural_networks::Loss_CategoricalCrossentrophy<float, int64_t> loss_funtion;
float lowest_loss = 9999999;
utils::Mf best_dense_1_weights = dense1.weights;
utils::Vf best_dense_1_biases = dense1.biases;
utils::Mf best_dense_2_weights = dense2.weights;
utils::Vf best_dense_2_biases = dense2.biases;
utils::Vf vectRND;
utils::Vector<int64_t> predections; utils::Vector<int64_t> predections;
// Create dataset
neural_networks::create_spital_data<float, int64_t>(number_of_samples, number_of_classes, X, y);
//neural_networks::create_vertical_data<float, int64_t>(number_of_samples, number_of_classes, X, y);
// Create Dense layer with 2 input features and 64 output values
neural_networks::Dense_Layer<float> dense1(2, 64);
// Create ReLU activation (to be used with Dense layer)
neural_networks::Activation_ReLU<float> activation1;
// Create a second Dense layer with 64 inputs (as we take the values from the previous layer)
// and number_of_classes output values
neural_networks::Dense_Layer<float> dense2(64, number_of_classes);
// Create a Softmax classifier's combined loss and activation
neural_networks::Activation_Softmax_Loss_CategoricalCrossentropy<float, int64_t> loss_activation;
// Create optimizer
//neural_networks::Optimizer_SGD<float> optimizer(1, 1e-3, 0.5);
neural_networks::Optimizer_Adagrad<float> optimizer(1, 1e-3, 1e-6);
// Train in loop
for (uint64_t epoch = 0; epoch < number_of_epochs+1; ++epoch){
for (uint64_t i = 0; i < 10; ++i){ // Perform a forward pass of our training data through this layer
dense1.forward(X);
// Perform a forward pass through the activation function
// takes the output of the first layer here
activation1.forward(dense1.outputs);
// Generate a new set of weights for iteration // Perform a forward pass through second Dense layer
numerics::inplace_matrandom_mul(dense1.weights,0.98f, 1.02f); // takes output of activation function of the first layer as input
numerics::inplace_vecrandom_mul(dense1.biases,0.98f, 1.02f); dense2.forward(activation1.outputs);
numerics::inplace_matrandom_mul(dense2.weights,0.98f, 1.02f); // Perform a foard pass through the activation/loss function
numerics::inplace_vecrandom_mul(dense2.biases,0.98f, 1.02f); // takes the output of the second dense layer here and returns loss
loss = loss_activation.forward(dense2.outputs, y);
// Perform a forward pass of the training data through this layer
dense1.forward(X);
activation1.forward(dense1.outputs);
dense2.forward(activation1.outputs);
activation2.forward(dense2.outputs);
// Perform a farward pass through activation function
// it takes the output of the second dense layer here and returns loss
loss = loss_funtion.calculate(activation2.outputs, y);
predections = numerics::matargmax_row<int64_t, float>(activation2.outputs);
if (y.cols() < 1){
class_targets = numerics::matargmax_row<int64_t, int64_t>(y);
}else{
class_targets = y.get_col(0);
}
accuracy = numerics::vecmean_equal<float>(predections, class_targets);
if (loss < lowest_loss){
//std::cout << "New set of weights found, iteration:" << i << ", loss:" << loss << ", acc:" << accuracy << std::endl;
best_dense_1_weights = dense1.weights;
best_dense_1_biases = dense1.biases;
best_dense_2_weights = dense2.weights;
best_dense_2_biases = dense2.biases;
lowest_loss = loss;
} else{
//std::cout << "HERE" << std::endl;
dense1.weights = best_dense_1_weights;
dense1.biases = best_dense_1_biases;
dense2.weights = best_dense_2_weights;
dense2.biases = best_dense_2_biases;
}
// Calculate accuracy from output of activation2 and targets
predections = numerics::matargmax_row<int64_t, float>(loss_activation.outputs);
if (y.cols() < 1){
class_targets = numerics::matargmax_row<int64_t, int64_t>(y);
}else{
class_targets = y.get_col(0);
} }
//std::cout << loss << std::endl; accuracy = numerics::vecmean_equal<float>(predections, class_targets);
//std::cout << accuracy << std::endl;
utils::Matrix<float> softmax_outputs{{0.7, 0.1, 0.2},
{0.1, 0.5, 0.4},
{0.02, 0.9, 0.08}};
utils::Matrix<int64_t> clas_targets{{0},{1},{1}};
neural_networks::Activation_Softmax_Loss_CategoricalCrossentropy<float, int64_t> softmax_loss;
softmax_loss.backward(softmax_outputs, clas_targets);
utils::Matrix<float> dvalues1 = softmax_loss.dinputs;
neural_networks::Activation_Softmax<float> activation;
activation.outputs = softmax_outputs;
//neural_networks::Loss_CategoricalCrossentrophy<float, int64_t> loss;
dvalues1.print(); if (!(epoch%100)){
std::cout << "epoch: " << epoch;
std::cout << ", acc: " << accuracy;
std::cout << ", loss: " << loss;
std::cout << ", lr: " << optimizer.current_learning_rate;
std::cout << std::endl;
}
// Backward pass
loss_activation.backward(loss_activation.outputs, y);
dense2.backward(loss_activation.dinputs);
activation1.backward(dense2.dinputs);
dense1.backward(activation1.dinputs);
// Update weights and biases
optimizer.pre_update_params();
optimizer.update_params(dense1);
optimizer.update_params(dense2);
optimizer.post_update_params();
}
/*
utils::Vd a = utils::linspace<double>(1, 10, 10, true);
a.print();
mesh::Mesh1D<double> mesh(a);
mesh.generate_vertices(0.5, 10.5);
double Gamma = 1.0;
utils::Md A;
utils::Vd b, s(10,1);
core::Configs<double>& cfg = core::Configs<double>::defaults();
cfg.grid = core::GridKind::Uniform;
cfg.left = {core::FDKind::Forward, core::BCKind::Neumann, 0.0};
cfg.right = {core::FDKind::Backward, core::BCKind::Neumann, 0.0};
cfg.solver = core::SolverKind::LU;
fluids::Diffusion1D<double> diffusion(cfg, mesh, Gamma);
diffusion.assemble(A, b, s);
*/
return 0; return 0;
} }
@@ -21,6 +21,11 @@ namespace neural_networks{
utils::Vector<T> dbiases; utils::Vector<T> dbiases;
utils::Matrix<T> dinputs; utils::Matrix<T> dinputs;
// Variables for optimizers
utils::Matrix<T> weight_momentums;
utils::Vector<T> bias_momentums;
utils::Matrix<T> weight_cache;
utils::Vector<T> bias_cache;
// Default Constructor // Default Constructor
Dense_Layer() = default; Dense_Layer() = default;
@@ -4,6 +4,7 @@
#include "datasets/spiral.h" #include "datasets/spiral.h"
#include "datasets/vertical.h" #include "datasets/vertical.h"
#include "layers/Dense_Layer.h" #include "layers/Dense_Layer.h"
@@ -11,5 +12,10 @@
#include "activation_functions/Activation_Softmax.h" #include "activation_functions/Activation_Softmax.h"
#include "activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h" #include "activation_functions/Activation_Softmax_Loss_CategoricalCrossentropy.h"
#include "loss/Loss.h" // Base #include "loss/Loss.h" // Base
#include "loss/Loss_CategoricalCrossentrophy.h" #include "loss/Loss_CategoricalCrossentrophy.h"
#include "optimizers/Optimizer_SGD.h"
#include "optimizers/Optimizer_Adagrad.h"
@@ -0,0 +1,80 @@
#pragma once
#include "./core/omp_config.h"
#include "./utils/vector.h"
#include "./utils/matrix.h"
#include "./numerics/matmul.h"
#include <math.h>
namespace neural_networks{
/// Adagrad optimizer: per-parameter adaptive learning rates.
///
/// Every parameter keeps a running sum ("cache") of its squared gradients.
/// The step applied to a parameter is the current learning rate times the
/// gradient, divided by the square root of that running sum (plus epsilon).
///
/// Expected call order per training step:
///   pre_update_params(); update_params(layer)...; post_update_params();
///
/// @tparam T floating-point scalar type of the parameters.
template <typename T>
struct Optimizer_Adagrad{
    T learning_rate = T{1};                   // initial (undecayed) learning rate
    T current_learning_rate = learning_rate;  // learning rate used by update_params
    T decay = T{0};                           // learning-rate decay factor; 0 disables decay
    T epsilon = T{1e-7};                      // added to the denominator to avoid division by zero
    uint64_t iterations = 0;                  // number of completed update steps

    // Default Constructor
    Optimizer_Adagrad() = default;

    /// @param lr        initial learning rate
    /// @param lr_decay  learning-rate decay factor (0 disables decay)
    /// @param epsilons  denominator offset guarding against division by zero
    explicit Optimizer_Adagrad(const T lr, const T lr_decay, const T epsilons): learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), epsilon(epsilons) {}

    /// Recomputes current_learning_rate as learning_rate / (1 + decay * iterations)
    /// when decay is non-zero. Call once before the update_params calls of a step.
    void pre_update_params(){
        if(decay){
            current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
        }
    }

    /// Applies one Adagrad step to the weights and biases of `layer`.
    ///
    /// Reads layer.dweights / layer.dbiases, accumulates their squares into
    /// layer.weight_cache / layer.bias_cache, and updates layer.weights /
    /// layer.biases in place.
    ///
    /// @tparam Layer type exposing weights, biases, dweights, dbiases,
    ///               weight_cache and bias_cache.
    template <typename Layer>
    void update_params(Layer& layer){
        // If the layer's cache arrays do not match the parameter shapes yet,
        // (re)create them filled with zeros.
        if ((layer.weight_cache.rows() != layer.weights.rows()) || (layer.weight_cache.cols() != layer.weights.cols())){
            layer.weight_cache.resize(layer.weights.rows(), layer.weights.cols(), T{0});
        }
        if (layer.bias_cache.size() != layer.biases.size()){
            layer.bias_cache.resize(layer.biases.size(), T{0});
        }

        // Weights: accumulate the squared gradient into the cache (+=, not =,
        // so the history of past gradients is kept), then take the SGD step
        // normalised by the square root of the accumulated cache.
        for (uint64_t i = 0; i < layer.weights.rows(); ++i){
            for (uint64_t j = 0; j < layer.weights.cols(); ++j){
                const T grad = layer.dweights(i,j);
                layer.weight_cache(i,j) += grad*grad;
                layer.weights(i,j) -= (current_learning_rate*grad) / (std::sqrt(layer.weight_cache(i,j)) + epsilon);
            }
        }

        // Biases: same rule as for the weights.
        for (uint64_t i = 0; i < layer.biases.size(); ++i){
            const T grad = layer.dbiases[i];
            layer.bias_cache[i] += grad*grad;
            layer.biases[i] -= (current_learning_rate*grad) / (std::sqrt(layer.bias_cache[i]) + epsilon);
        }
    }

    /// Advances the step counter. Call once after all update_params calls of a step.
    void post_update_params(){
        ++iterations;
    }
};
} // end namespace neural_networks
@@ -0,0 +1,97 @@
#pragma once
#include "./core/omp_config.h"
#include "./utils/vector.h"
#include "./utils/matrix.h"
#include "./numerics/matmul.h"
namespace neural_networks{
/// Stochastic gradient descent optimizer with optional learning-rate decay
/// and optional momentum.
///
/// Expected call order per training step:
///   pre_update_params(); update_params(layer)...; post_update_params();
///
/// @tparam T floating-point scalar type of the parameters.
template <typename T>
struct Optimizer_SGD{
    T learning_rate = T{1};                   // initial (undecayed) learning rate
    T current_learning_rate = learning_rate;  // learning rate used by update_params
    T decay = T{0};                           // learning-rate decay factor; 0 disables decay
    T momentum = T{0};                        // fraction of the previous update retained; 0 disables momentum
    uint64_t iterations = 0;                  // number of completed update steps

    // Scratch buffers holding the update most recently applied by update_params.
    utils::Matrix<T> weight_updates;
    utils::Vector<T> bias_updates;

    // Default Constructor
    Optimizer_SGD() = default;

    /// @param lr         initial learning rate
    /// @param lr_decay   learning-rate decay factor (0 disables decay)
    /// @param momentums  momentum factor (0 disables momentum)
    explicit Optimizer_SGD(const T lr, const T lr_decay, const T momentums): learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), momentum(momentums) {}

    /// Recomputes current_learning_rate as learning_rate / (1 + decay * iterations)
    /// when decay is non-zero. Call once before the update_params calls of a step.
    void pre_update_params(){
        if(decay){
            current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
        }
    }

    /// Applies one SGD step to the weights and biases of `layer`.
    ///
    /// Without momentum the update is  -current_learning_rate * gradient.
    /// With momentum the update is
    ///   momentum * previous_update - current_learning_rate * gradient
    /// and is stored back into layer.weight_momentums / layer.bias_momentums
    /// for the next step.
    ///
    /// @tparam Layer type exposing weights, biases, dweights, dbiases,
    ///               weight_momentums and bias_momentums.
    template <typename Layer>
    void update_params(Layer& layer){
        const auto rows = layer.weights.rows();
        const auto cols = layer.weights.cols();
        const auto bias_count = layer.biases.size();

        // The scratch buffers are shared between layers, so size them for
        // this layer; every element is overwritten below before being read.
        weight_updates.resize(rows, cols);   // can be optimized out later
        bias_updates.resize(bias_count);     // can be optimized out later

        if(momentum){
            // If the layer's momentum arrays do not match the parameter
            // shapes yet, (re)create them filled with zeros.
            if ((layer.weight_momentums.rows() != rows) || (layer.weight_momentums.cols() != cols)){
                layer.weight_momentums.resize(rows, cols, T{0});
            }
            if (layer.bias_momentums.size() != bias_count){
                layer.bias_momentums.resize(bias_count, T{0});
            }

            // Previous update scaled by the retain factor, minus the current
            // gradient step; remembered in the layer for the next iteration.
            for (uint64_t i = 0; i < rows; ++i){
                for (uint64_t j = 0; j < cols; ++j){
                    const T update = (momentum*layer.weight_momentums(i,j)) - (current_learning_rate*layer.dweights(i,j));
                    layer.weight_momentums(i,j) = update;
                    weight_updates(i,j) = update;
                }
            }
            for (uint64_t i = 0; i < bias_count; ++i){
                const T update = (momentum*layer.bias_momentums[i]) - (current_learning_rate*layer.dbiases[i]);
                layer.bias_momentums[i] = update;
                bias_updates[i] = update;
            }
        }else{
            // Vanilla SGD: assign (not accumulate) so that values left in the
            // shared scratch buffers by earlier calls cannot leak into this step.
            for (uint64_t i = 0; i < rows; ++i){
                for (uint64_t j = 0; j < cols; ++j){
                    weight_updates(i,j) = -(current_learning_rate*layer.dweights(i,j));
                }
            }
            for (uint64_t i = 0; i < bias_count; ++i){
                bias_updates[i] = -(current_learning_rate*layer.dbiases[i]);
            }
        }

        // Apply the updates to both the weights and the biases.
        for (uint64_t i = 0; i < rows; ++i){
            for (uint64_t j = 0; j < cols; ++j){
                layer.weights(i,j) += weight_updates(i,j);
            }
        }
        for (uint64_t i = 0; i < bias_count; ++i){
            layer.biases[i] += bias_updates[i];
        }
    }

    /// Advances the step counter. Call once after all update_params calls of a step.
    void post_update_params(){
        ++iterations;
    }
};
} // end namespace neural_networks