Started on regularization in Loss.h. I need to refactor matsum.h since I need a total sum over the whole matrix. Also, matmul needs an elementwise multiplication function, which is the next thing to do for the regularization.
This commit is contained in:
@@ -12,45 +12,51 @@ namespace neural_networks{
|
||||
template <typename T>
|
||||
struct Dense_Layer{
|
||||
|
||||
utils::Matrix<T> _inputs;
|
||||
utils::Matrix<T> weights;
|
||||
utils::Vector<T> biases;
|
||||
utils::Matrix<T> outputs;
|
||||
T weight_regularizer_l1 = {1e-4};
|
||||
T weight_regularizer_l2 = {1e-4};
|
||||
|
||||
utils::Matrix<T> dweights;
|
||||
utils::Vector<T> dbiases;
|
||||
utils::Matrix<T> dinputs;
|
||||
T bias_regularizer_l1 = {1e-4};
|
||||
T bias_regularizer_l2 = {1e-4};
|
||||
|
||||
// Variables for optimizers
|
||||
utils::Matrix<T> weight_momentums;
|
||||
utils::Vector<T> bias_momentums;
|
||||
utils::Matrix<T> weight_cache;
|
||||
utils::Vector<T> bias_cache;
|
||||
|
||||
// Default Constructor
|
||||
Dense_Layer() = default;
|
||||
utils::Matrix<T> _inputs;
|
||||
utils::Matrix<T> weights;
|
||||
utils::Vector<T> biases;
|
||||
utils::Matrix<T> outputs;
|
||||
|
||||
// Constructor
|
||||
Dense_Layer(const uint64_t n_inputs, const uint64_t n_neurons){
|
||||
|
||||
weights.random(n_inputs, n_neurons, -1, 1);
|
||||
biases.resize(n_neurons, T{0});
|
||||
|
||||
}
|
||||
utils::Matrix<T> dweights;
|
||||
utils::Vector<T> dbiases;
|
||||
utils::Matrix<T> dinputs;
|
||||
|
||||
void forward(const utils::Matrix<T>& inputs){
|
||||
_inputs = inputs;
|
||||
outputs = numerics::matadd(numerics::matmul_auto(inputs, weights), biases, "row");
|
||||
}
|
||||
// Variables for optimizers
|
||||
utils::Matrix<T> weight_momentums;
|
||||
utils::Vector<T> bias_momentums;
|
||||
utils::Matrix<T> weight_cache;
|
||||
utils::Vector<T> bias_cache;
|
||||
|
||||
// Default Constructor
|
||||
Dense_Layer() = default;
|
||||
|
||||
void backward(const utils::Matrix<T>& dvalues){
|
||||
// Gradients on parameters
|
||||
dweights = numerics::matmul(numerics::transpose(_inputs), dvalues);
|
||||
dbiases = numerics::matsum(dvalues, "row");
|
||||
//Gradient on values
|
||||
dinputs = numerics::matmul(dvalues, numerics::transpose(weights));
|
||||
// Constructor
|
||||
Dense_Layer(const uint64_t n_inputs, const uint64_t n_neurons){
|
||||
|
||||
weights.random(n_inputs, n_neurons, -1, 1);
|
||||
biases.resize(n_neurons, T{0});
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
void forward(const utils::Matrix<T>& inputs){
|
||||
_inputs = inputs;
|
||||
outputs = numerics::matadd(numerics::matmul_auto(inputs, weights), biases, "row");
|
||||
}
|
||||
|
||||
void backward(const utils::Matrix<T>& dvalues){
|
||||
// Gradients on parameters
|
||||
dweights = numerics::matmul(numerics::transpose(_inputs), dvalues);
|
||||
dbiases = numerics::matsum(dvalues, "row");
|
||||
//Gradient on values
|
||||
dinputs = numerics::matmul(dvalues, numerics::transpose(weights));
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -5,30 +5,65 @@
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
#include "./numerics/vecmean.h"
|
||||
#include "numerics/vecmean.h"
|
||||
#include "numerics/matabs.h"
|
||||
#include "numerics/matmean.h"
|
||||
|
||||
namespace neural_networks{
|
||||
|
||||
template <typename Td, typename Ti>
|
||||
struct Loss{
|
||||
|
||||
utils::Vector<Td> sample_losses;
|
||||
utils::Matrix<Td> dinputs;
|
||||
Td data_loss;
|
||||
utils::Vector<Td> sample_losses;
|
||||
utils::Matrix<Td> dinputs;
|
||||
Td data_loss;
|
||||
Td regularization_losss;
|
||||
|
||||
virtual utils::Vector<Td> forward(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y) = 0;
|
||||
virtual void backward(const utils::Matrix<Td>& dvalues, const utils::Matrix<Ti>& y) = 0;
|
||||
virtual utils::Vector<Td> forward(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y) = 0;
|
||||
virtual void backward(const utils::Matrix<Td>& dvalues, const utils::Matrix<Ti>& y) = 0;
|
||||
|
||||
Td calculate(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y){
|
||||
|
||||
// Calculate sample losses
|
||||
sample_losses = forward(output, y);
|
||||
Td calculate(const utils::Matrix<Td>& output, const utils::Matrix<Ti>& y){
|
||||
|
||||
// Calculate sample losses
|
||||
sample_losses = forward(output, y);
|
||||
|
||||
// Calculate mean loss
|
||||
data_loss = numerics::vecmean(sample_losses);
|
||||
return data_loss;
|
||||
// Calculate mean loss
|
||||
data_loss = numerics::vecmean(sample_losses);
|
||||
|
||||
return data_loss;
|
||||
|
||||
}
|
||||
|
||||
template <typename Layer>
|
||||
Td regularization_loss(const Layer& layer){
|
||||
// 0 by default
|
||||
regularization_losss = 0;
|
||||
|
||||
// L1 regularization - weights
|
||||
// calculate only when factor greater than 0
|
||||
if (layer.weight_regularizer_l1){
|
||||
regularization_losss += layer.weight_regularizer_l1 * numerics::matsum_coeff(numerics::matabs(layer.weights));
|
||||
}
|
||||
|
||||
// L2 regularization - weights
|
||||
if (layer.weight_regularizer_l2){
|
||||
regularization_losss += layer.weight_regularizer_l2 * numerics::matsum_coeff(numerics::matmul(layer.weights,layer.weights)); // elementwise!
|
||||
}
|
||||
|
||||
// L1 regularization - biases
|
||||
// calculate only when factor greater than 0
|
||||
if (layer.bias_regularizer_l1){
|
||||
regularization_losss += layer.bias_regularizer_l1 * layer.biases.abs().sum();
|
||||
}
|
||||
// L2 regularization - biases
|
||||
if (layer.bias_regularizer_l2){
|
||||
regularization_losss += layer.bias_regularizer_l2 * layer.biases.multiply(layer.biases).sum();
|
||||
}
|
||||
|
||||
return regularization_losss;
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
} // end namespace neural_networks
|
||||
@@ -19,3 +19,5 @@
|
||||
|
||||
#include "optimizers/Optimizer_SGD.h"
|
||||
#include "optimizers/Optimizer_Adagrad.h"
|
||||
#include "optimizers/Optimizer_RMSprop.h"
|
||||
#include "optimizers/Optimizer_Adam.h"
|
||||
@@ -0,0 +1,134 @@
|
||||
#pragma once
|
||||
|
||||
#include "./core/omp_config.h"
|
||||
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
#include "./numerics/matmul.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
|
||||
|
||||
namespace neural_networks{
|
||||
|
||||
template <typename T>
|
||||
struct Optimizer_Adam{
|
||||
|
||||
T learning_rate = T{1};
|
||||
T current_learning_rate = learning_rate;
|
||||
T decay = T{0};
|
||||
T epsilon = T{1e-7};
|
||||
T beta_1 = T{0.9};
|
||||
T beta_2 = T{0.999};
|
||||
uint64_t iterations = 0;
|
||||
|
||||
utils::Matrix<T> weight_momentums_corrected;
|
||||
utils::Vector<T> bias_momentums_corrected;
|
||||
utils::Matrix<T> weight_cache_corrected;
|
||||
utils::Vector<T> bias_cache_corrected;
|
||||
|
||||
// Default Constructor
|
||||
Optimizer_Adam() = default;
|
||||
|
||||
// Constructor
|
||||
explicit Optimizer_Adam(const T lr, const T lr_decay, const T epsilons, const T beta1, const T beta2):
|
||||
learning_rate(lr),
|
||||
current_learning_rate{lr},
|
||||
decay(lr_decay),
|
||||
epsilon(epsilons),
|
||||
beta_1(beta1),
|
||||
beta_2(beta2) {}
|
||||
|
||||
void pre_update_params(){
|
||||
if(decay){
|
||||
current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
|
||||
//std::cout << current_learning_rate << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Layer>
|
||||
void update_params(Layer& layer){
|
||||
|
||||
// if layer does not contain cache arrays, create them filled with zeros.
|
||||
if ((layer.weight_cache.rows() != layer.weights.rows()) || (layer.weight_cache.cols() != layer.weights.cols())){
|
||||
layer.weight_momentums.resize(layer.weights.rows(), layer.weights.cols(), T{0});
|
||||
layer.weight_cache.resize(layer.weights.rows(), layer.weights.cols(), T{0});
|
||||
}
|
||||
if (layer.bias_cache.size() != layer.biases.size()){
|
||||
layer.bias_momentums.resize(layer.biases.size(), T{0});
|
||||
layer.bias_cache.resize(layer.biases.size(), T{0});
|
||||
}
|
||||
|
||||
// Update momentum with current gradients
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
layer.weight_momentums(i,j) = (beta_1 * layer.weight_momentums(i,j)) + ((T{1} - beta_1) * layer.dweights(i,j));
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
layer.bias_momentums[i] = (beta_1 * layer.bias_momentums[i]) + ((T{1} - beta_1) * layer.dbiases[i]);
|
||||
}
|
||||
|
||||
|
||||
// Get corrected momentum
|
||||
// interation is 0 at first pass
|
||||
// and we need to start with 1 here
|
||||
weight_momentums_corrected.resize(layer.weights.rows(),layer.weights.cols()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
weight_momentums_corrected(i,j) = layer.weight_momentums(i,j) / (T{1} - std::pow(beta_1, iterations+1));
|
||||
}
|
||||
}
|
||||
bias_momentums_corrected.resize(layer.biases.size()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
bias_momentums_corrected[i] = layer.bias_momentums[i] / (T{1} - std::pow(beta_1, iterations+1));
|
||||
}
|
||||
|
||||
|
||||
// Update cache with squared current gradients
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
layer.weight_cache(i,j) = (beta_2*layer.weight_cache(i,j)) + ((T{1}-beta_2) * (layer.dweights(i,j)*layer.dweights(i,j)));
|
||||
}
|
||||
}
|
||||
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){ // can maybe be included when updating weights (saves time)
|
||||
layer.bias_cache[i] = (beta_2*layer.bias_cache[i]) + ((T{1}-beta_2) * (layer.dbiases[i]*layer.dbiases[i]));
|
||||
}
|
||||
|
||||
// Get corrected cache
|
||||
// interation is 0 at first pass
|
||||
// and we need to start with 1 here
|
||||
weight_cache_corrected.resize(layer.weights.rows(),layer.weights.cols()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
weight_cache_corrected(i,j) = layer.weight_cache(i,j) / (T{1} - std::pow(beta_2, iterations+1));
|
||||
}
|
||||
}
|
||||
bias_cache_corrected.resize(layer.biases.size()); // can be optimized out later
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
bias_cache_corrected[i] = layer.bias_cache[i] / (T{1} - std::pow(beta_2, iterations+1));
|
||||
}
|
||||
|
||||
|
||||
// Vanilla SGD parameter update + normalization with squared rooted cache
|
||||
for (uint64_t i = 0; i < layer.weights.rows(); ++i){
|
||||
for (uint64_t j = 0; j < layer.weights.cols(); ++j){
|
||||
layer.weights(i,j) -= (current_learning_rate*weight_momentums_corrected(i,j)) / (std::sqrt(weight_cache_corrected(i,j)) + epsilon);
|
||||
}
|
||||
}
|
||||
for (uint64_t i = 0; i < layer.biases.size(); ++i){
|
||||
layer.biases[i] -= (current_learning_rate*bias_momentums_corrected[i]) / (std::sqrt(bias_cache_corrected[i]) + epsilon);
|
||||
}
|
||||
}
|
||||
|
||||
void post_update_params(){
|
||||
iterations++;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // end namespace neural_networks
|
||||
@@ -0,0 +1,81 @@
|
||||
#pragma once
|
||||
|
||||
#include "./core/omp_config.h"
|
||||
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
#include "./numerics/matmul.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
|
||||
|
||||
|
||||
namespace neural_networks{
|
||||
|
||||
/// RMSprop optimizer: scales every gradient by a running average of its square.
///
/// Usage per training step: pre_update_params(), update_params(layer) for every
/// trainable layer, then post_update_params().
template <typename T>
struct Optimizer_RMSprop{

    T learning_rate = T{1};
    T current_learning_rate = learning_rate;   // learning rate after decay
    T decay = T{0};                            // 0 disables learning-rate decay
    T epsilon = T{1e-7};                       // keeps the division away from zero
    T rho = T{0.9};                            // cache decay rate
    uint64_t iterations = 0;                   // completed update steps

    // Default Constructor
    Optimizer_RMSprop() = default;

    // Constructor
    explicit Optimizer_RMSprop(const T lr, const T lr_decay, const T epsilons, const T rhos):
        learning_rate(lr),
        current_learning_rate{lr},
        decay(lr_decay),
        epsilon(epsilons),
        rho(rhos) {}

    /// Applies the decay schedule: lr / (1 + decay * iterations). No-op when decay is 0.
    void pre_update_params(){
        if(decay){
            current_learning_rate = learning_rate * (T{1}/(T{1}+(decay*iterations)));
        }
    }

    /// Performs one RMSprop step on `layer`, using its `dweights` / `dbiases`.
    ///
    /// `Layer` must expose weights, biases, dweights, dbiases, weight_cache and bias_cache.
    template <typename Layer>
    void update_params(Layer& layer){

        // A cache whose shape does not match the parameters is (re)created with zeros.
        const bool weight_cache_fits =
            (layer.weight_cache.rows() == layer.weights.rows()) &&
            (layer.weight_cache.cols() == layer.weights.cols());
        if (!weight_cache_fits){
            layer.weight_cache.resize(layer.weights.rows(), layer.weights.cols(), T{0});
        }
        if (layer.bias_cache.size() != layer.biases.size()){
            layer.bias_cache.resize(layer.biases.size(), T{0});
        }

        // Weights: refresh the cache entry with the squared gradient, then take an
        // SGD-style step normalised by the square root of that entry.
        for (uint64_t row = 0; row < layer.weights.rows(); ++row){
            for (uint64_t col = 0; col < layer.weights.cols(); ++col){
                layer.weight_cache(row,col) = (rho*layer.weight_cache(row,col)) + ((T{1}-rho) * (layer.dweights(row,col)*layer.dweights(row,col)));
                layer.weights(row,col) -= (current_learning_rate*layer.dweights(row,col)) / (std::sqrt(layer.weight_cache(row,col)) + epsilon);
            }
        }

        // Biases: same rule, element by element.
        for (uint64_t k = 0; k < layer.biases.size(); ++k){
            layer.bias_cache[k] = (rho*layer.bias_cache[k]) + ((T{1}-rho) * (layer.dbiases[k]*layer.dbiases[k]));
            layer.biases[k] -= (current_learning_rate*layer.dbiases[k]) / (std::sqrt(layer.bias_cache[k]) + epsilon);
        }
    }

    /// Counts the finished step; call once after all layers were updated.
    void post_update_params(){
        iterations++;
    }

};
|
||||
|
||||
} // end namespace neural_networks
|
||||
@@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
|
||||
#include "./numerics/abs.h"
|
||||
#include "./utils/matrix.h"
|
||||
|
||||
namespace numerics{
|
||||
|
||||
template <typename T>
|
||||
void inplace_matabs(utils::Matrix<T>& A){
|
||||
|
||||
for (uint64_t i = 0; i < A.rows(); ++i){
|
||||
for (uint64_t j = 0; j < A.cols(); ++j){
|
||||
A(i,j) = numerics::abs(A(i,j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
utils::Matrix<T> matabs(const utils::Matrix<T>& A){
|
||||
utils::Matrix<T> B = A;
|
||||
inplace_matabs(B);
|
||||
return B;
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace numerics
|
||||
|
||||
+10
-13
@@ -1,5 +1,4 @@
|
||||
#ifndef _mean_n_
|
||||
#define _mean_n_
|
||||
#pragma once
|
||||
|
||||
#include "./utils/vector.h"
|
||||
#include "./utils/matrix.h"
|
||||
@@ -8,7 +7,7 @@
|
||||
namespace numerics{
|
||||
|
||||
template <typename T>
|
||||
T matmean(utils::Matrix<T>& A) {
|
||||
T matmean(const utils::Matrix<T>& A) {
|
||||
|
||||
T mean(T{0});
|
||||
|
||||
@@ -27,7 +26,7 @@ namespace numerics{
|
||||
|
||||
|
||||
template <typename T>
|
||||
void inplace_matmean_row(utils::Matrix<T>& A, utils::Vector<T>& b) {
|
||||
void inplace_matmean_row(const utils::Matrix<T>& A, utils::Vector<T>& b) {
|
||||
|
||||
const uint64_t rows = A.rows();
|
||||
const uint64_t cols = A.cols();
|
||||
@@ -40,12 +39,12 @@ namespace numerics{
|
||||
for (uint64_t i = 0; i < rows; ++i){
|
||||
b[j] += A(i, j);
|
||||
}
|
||||
b[j] =/ static_cast<T>(rows);
|
||||
b[j] /= static_cast<T>(rows);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void inplace_matmean_cols(utils::Matrix<T>& A) {
|
||||
void inplace_matmean_cols(const utils::Matrix<T>& A, utils::Vector<T>& b) {
|
||||
|
||||
const uint64_t rows = A.rows();
|
||||
const uint64_t cols = A.cols();
|
||||
@@ -58,15 +57,15 @@ namespace numerics{
|
||||
for (uint64_t j = 0; j < cols; ++j){
|
||||
b[i] += A(i, j);
|
||||
}
|
||||
b[j] =/ static_cast<T>(cols);
|
||||
b[i] /= static_cast<T>(cols);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
utils::Vector<T> matmean_row(utils::Matrix<T>& A) {
|
||||
utils::Vector<T> matmean_row(const utils::Matrix<T>& A) {
|
||||
|
||||
utils:Vector<T> b(A.rows(), T{0});
|
||||
utils::Vector<T> b(A.rows(), T{0});
|
||||
|
||||
inplace_matmean_row(A, b);
|
||||
|
||||
@@ -74,9 +73,9 @@ namespace numerics{
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
utils::Vector<T> matmean_col(utils::Matrix<T>& A) {
|
||||
utils::Vector<T> matmean_col(const utils::Matrix<T>& A) {
|
||||
|
||||
utils:Vector<T> b(A.cols(), T{0});
|
||||
utils::Vector<T> b(A.cols(), T{0});
|
||||
|
||||
inplace_matmean_cols(A, b);
|
||||
|
||||
@@ -84,5 +83,3 @@ namespace numerics{
|
||||
}
|
||||
|
||||
} // namespace numerics
|
||||
|
||||
#endif // _mean_n_
|
||||
@@ -7,6 +7,20 @@
|
||||
|
||||
namespace numerics{
|
||||
|
||||
template <typename T>
|
||||
T matsum_coeff(const utils::Matrix<T>& A) {
|
||||
|
||||
T b;
|
||||
|
||||
for (uint64_t i = 0; i < A.cols(); ++i){
|
||||
for (uint64_t j = 0; j < A.rows(); ++j){
|
||||
b += A(i, j);
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
utils::Vector<T> matsum(const utils::Matrix<T>& A, std::string method) {
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "./numerics/matmul.h"
|
||||
#include "./numerics/matscalar.h"
|
||||
#include "./numerics/matmax.h"
|
||||
#include "./numerics/matabs.h"
|
||||
#include "./numerics/matdiv.h"
|
||||
#include "./numerics/matvec.h"
|
||||
#include "./numerics/matadd.h"
|
||||
|
||||
@@ -400,12 +400,29 @@ public:
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//############################################
|
||||
//# VECTOR: Abs #
|
||||
//############################################
|
||||
/// Returns a copy of this vector in which every entry that compares less than
/// zero is replaced by its negation; all other entries are copied unchanged.
Vector<T> abs()const{
    Vector<T> magnitudes = *this;
    for (uint64_t idx = 0; idx < v.size(); ++idx){
        const bool is_negative = v[idx] < T{0};
        if (!is_negative){
            continue;
        }
        magnitudes[idx] = -v[idx];
    }
    return magnitudes;
}
|
||||
|
||||
|
||||
//############################################
|
||||
//# VECTOR: Norm #
|
||||
//############################################
|
||||
/// Returns the square root of the dot product of this vector with itself, converted to T.
T norm() const{
    const auto self_dot = this->dot(*this);
    return static_cast<T>(std::sqrt(self_dot));
}
|
||||
|
||||
//############################################
|
||||
//# VECTOR: Normalize #
|
||||
//############################################
|
||||
|
||||
Reference in New Issue
Block a user