Done with SGD and Adagrad, still need to optimize them but they work.
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
#pragma once
|
||||
|
||||
#include "./core/omp_config.h"
|
||||
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
#include "./numerics/matmul.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
|
||||
|
||||
|
||||
namespace neural_networks{
|
||||
|
||||
/// Adagrad optimizer: per-parameter step sizes scaled by the accumulated
/// history of squared gradients.
///
/// Usage per training step, for every trainable layer:
///   pre_update_params();  update_params(layer);  post_update_params();
///
/// @tparam T  Floating-point scalar type used for all hyper-parameters.
template <typename T>
struct Optimizer_Adagrad{

    T learning_rate = T{1};                   // initial (undecayed) learning rate
    T current_learning_rate = learning_rate;  // learning rate actually used for the current step
    T decay = T{0};                           // learning-rate decay factor; 0 disables decay
    T epsilon = T{1e-7};                      // added to the denominator to avoid division by zero
    uint64_t iterations = 0;                  // number of completed steps (see post_update_params)

    // Default Constructor
    Optimizer_Adagrad() = default;

    /// @param lr        Initial learning rate.
    /// @param lr_decay  Learning-rate decay factor (0 disables decay).
    /// @param epsilons  Denominator stabiliser.
    explicit Optimizer_Adagrad(const T lr, const T lr_decay, const T epsilons)
        : learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), epsilon(epsilons) {}

    /// Recomputes current_learning_rate as
    /// learning_rate / (1 + decay * iterations) when decay is non-zero.
    void pre_update_params(){
        if (decay != T{0}){
            current_learning_rate =
                learning_rate * (T{1} / (T{1} + (decay * static_cast<T>(iterations))));
        }
    }

    /// Applies one Adagrad step to `layer`.
    ///
    /// The layer must expose `weights`, `dweights`, `weight_cache`
    /// (2-D, with rows()/cols()/resize(r,c,value)/operator()(i,j)) and
    /// `biases`, `dbiases`, `bias_cache`
    /// (1-D, with size()/resize(n,value)/operator[](i)).
    ///
    /// For every parameter p with gradient g:
    ///   cache += g * g
    ///   p     -= current_learning_rate * g / (sqrt(cache) + epsilon)
    template <typename Layer>
    void update_params(Layer& layer){

        const uint64_t n_rows   = layer.weights.rows();
        const uint64_t n_cols   = layer.weights.cols();
        const uint64_t n_biases = layer.biases.size();

        // If the layer's cache arrays do not match the parameter shapes yet,
        // create them filled with zeros.
        if ((layer.weight_cache.rows() != n_rows) || (layer.weight_cache.cols() != n_cols)){
            layer.weight_cache.resize(n_rows, n_cols, T{0});
        }
        if (layer.bias_cache.size() != n_biases){
            layer.bias_cache.resize(n_biases, T{0});
        }

        // Weights: accumulate the squared gradient into the cache, then take
        // the SGD step normalised by the square root of that cache. The cache
        // must be accumulated (+=), not overwritten, otherwise the gradient
        // history is lost and every step degenerates to lr * g / (|g| + eps).
        for (uint64_t i = 0; i < n_rows; ++i){
            for (uint64_t j = 0; j < n_cols; ++j){
                const T grad = layer.dweights(i,j);
                layer.weight_cache(i,j) += grad * grad;
                layer.weights(i,j) -= (current_learning_rate * grad)
                                    / (std::sqrt(layer.weight_cache(i,j)) + epsilon);
            }
        }

        // Biases: same rule as for the weights.
        for (uint64_t i = 0; i < n_biases; ++i){
            const T grad = layer.dbiases[i];
            layer.bias_cache[i] += grad * grad;
            layer.biases[i] -= (current_learning_rate * grad)
                             / (std::sqrt(layer.bias_cache[i]) + epsilon);
        }
    }

    /// Marks the current step as finished; call once per step after all
    /// layers have been updated.
    void post_update_params(){
        ++iterations;
    }

};
|
||||
|
||||
} // end namespace neural_networks
|
||||
@@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
|
||||
#include "./core/omp_config.h"
|
||||
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
#include "./numerics/matmul.h"
|
||||
|
||||
|
||||
|
||||
|
||||
namespace neural_networks{
|
||||
|
||||
template <typename T>
|
||||
struct Optimizer_SGD{
|
||||
|
||||
T learning_rate = T{1};
|
||||
T current_learning_rate = learning_rate;
|
||||
T decay = T{0};
|
||||
T momentum = T{0};
|
||||
uint64_t iterations = 0;
|
||||
|
||||
utils::Matrix<T> weight_updates;
|
||||
utils::Vector<T> bias_updates;
|
||||
|
||||
// Default Constructor
|
||||
Optimizer_SGD() = default;
|
||||
|
||||
// Constructor
|
||||
explicit Optimizer_SGD(const T lr, const T lr_decay, const T momentums): learning_rate(lr), current_learning_rate{lr}, decay(lr_decay), momentum(momentums) {}
|
||||
|
||||
void pre_update_params(){
|
||||
if(decay){
|
||||
current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
|
||||
//std::cout << current_learning_rate << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Layer>
|
||||
void update_params(Layer& layer){
|
||||
|
||||
|
||||
// if we use momentum
|
||||
if(momentum){
|
||||
// if layer does not contain momentum arrays, create them filled with zeros.
|
||||
if ((layer.weight_momentums.rows() != layer.weights.rows()) || (layer.weight_momentums.cols() != layer.weights.cols())){
|
||||
layer.weight_momentums.resize(layer.weights.rows(), layer.weights.cols(), T{0});
|
||||
}
|
||||
if (layer.bias_momentums.size() != layer.biases.size()){
|
||||
layer.bias_momentums.resize(layer.biases.size(), T{0});
|
||||
}
|
||||
// Build weight updates with momentum - take previous updates,
|
||||
// multiplied by retain factor and update with current gradients
|
||||
weight_updates.resize(layer.weights.rows(),layer.weights.cols()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
weight_updates(i,j) = (momentum*layer.weight_momentums(i,j)) - (current_learning_rate*layer.dweights(i,j));
|
||||
}
|
||||
}
|
||||
layer.weight_momentums = weight_updates;
|
||||
|
||||
// Build bias update
|
||||
bias_updates.resize(layer.biases.size()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
bias_updates[i] = (momentum*layer.bias_momentums[i]) - (current_learning_rate*layer.dbiases[i]);
|
||||
}
|
||||
layer.bias_momentums = bias_updates;
|
||||
}else{
|
||||
weight_updates.resize(layer.weights.rows(),layer.weights.cols()); // can be optimized out later
|
||||
// weights -= lr * dweights
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
weight_updates(i,j) -= current_learning_rate*layer.dweights(i,j);
|
||||
}
|
||||
}
|
||||
bias_updates.resize(layer.biases.size()); // can be optimized out later
|
||||
// biases -= lr * dbiases
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
bias_updates[i] -= current_learning_rate * layer.dbiases[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
layer.weights(i,j) += weight_updates(i,j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void post_update_params(){
|
||||
iterations++;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // end namespace neural_networks
|
||||
Reference in New Issue
Block a user