diff --git a/src/brain/modelgen/augmented_nn.py b/src/brain/modelgen/augmented_nn.py
new file mode 100644
index 00000000000..a05055a9e14
--- /dev/null
+++ b/src/brain/modelgen/augmented_nn.py
@@ -0,0 +1,115 @@
+#===----------------------------------------------------------------------===#
+#
+# Peloton
+#
+# augmented_nn.py
+#
+# Identification: src/brain/modelgen/augmented_nn.py
+#
+# Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+#
+#===----------------------------------------------------------------------===#
+
+import tensorflow as tf
+import functools
+import os
+import argparse
+
+def lazy_property(function):
+    attribute = '_cache_' + function.__name__
+
+    @property
+    @functools.wraps(function)
+    def decorator(self):
+        if not hasattr(self, attribute):
+            setattr(self, attribute, function(self))
+        return getattr(self, attribute)
+
+    return decorator
+
+class AugmentedNN:
+
+    def __init__(self, column_num, order=1, neuron_num=16, lr=0.1, **kwargs):
+        tf.reset_default_graph()
+        self.data = tf.placeholder(tf.float32, [None, column_num*2], name="data_")
+        self.target = tf.placeholder(tf.float32, [None, 1], name="target_")
+        self._column_num = column_num
+        self._order = order
+        self._neuron_num = neuron_num
+        self._lr = tf.placeholder_with_default(lr, shape=None,
+                                               name="learn_rate_")
+        self.tf_init = tf.global_variables_initializer
+        self.prediction
+        self.loss
+        self.optimize
+
+    @staticmethod
+    def jump_activation(k):
+        """
+        This is an activation function used to learn discontinuous functions.
+        Reference: https://dl.acm.org/citation.cfm?id=2326898
+        """
+        def jump_activation_k(x):
+            return tf.pow(tf.maximum(0.0, 1-tf.exp(-x)), k)
+        return jump_activation_k
+
+    @lazy_property
+    def prediction(self):
+        net = self.data
+        kernel_init = tf.random_normal_initializer(mean=0.0001, stddev=0.0001)
+        with tf.name_scope("hidden_layer"):
+            net_shape = tf.shape(net)
+            bsz = net_shape[0]
+
+            h1_layers = []
+            for i in range(1, self._order+1):
+                h1 = tf.layers.dense(net, self._neuron_num,
+                                     activation=self.jump_activation(i),
+                                     kernel_initializer=kernel_init)
+                h1_layers.append(h1)
+            h1_layers = tf.concat(h1_layers, 1)
+        with tf.name_scope("output_layer"):
+            net = tf.layers.dense(h1_layers, 1,
+                                  activation=self.jump_activation(1),
+                                  kernel_initializer=kernel_init)
+        net = tf.reshape(net, [bsz, -1], name="pred_")
+        return net
+
+    @lazy_property
+    def loss(self):
+        loss = tf.reduce_mean(tf.squared_difference(self.target, self.prediction),
+                              name='lossOp_')
+        return loss
+
+    @lazy_property
+    def optimize(self):
+        params = tf.trainable_variables()
+        gradients = tf.gradients(self.loss, params)
+        optimizer = tf.train.AdagradOptimizer(learning_rate=self._lr)
+        return optimizer.apply_gradients(zip(gradients, params),
+                                         name="optimizeOp_")
+
+    def write_graph(self, dir):
+        fname = "{}.pb".format(self.__repr__())
+        abs_path = os.path.join(dir, fname)
+        if not os.path.exists(abs_path):
+            tf.train.write_graph(tf.get_default_graph(),
+                                 dir, fname, False)
+
+    def __repr__(self):
+        return "augmented_nn"
+
+def main():
+    parser = argparse.ArgumentParser(description='AugmentedNN Model Generator')
+
+    parser.add_argument('--column_num', type=int, default=1,
+                        help='Number of input columns')
+    parser.add_argument('--order', type=int, default=3,
+                        help='Max order of activation function')
+    parser.add_argument('--neuron_num', type=int, default=20,
+                        help='Number of neurons in hidden layer')
+    parser.add_argument('--lr', type=float, default=0.001,
+                        help='Learning rate')
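
For intuition, not part of the patch: `prediction` above concatenates one jump-activated dense layer per order k = 1..order, then feeds the concatenation through a single jump(1) output unit. A minimal NumPy sketch of that forward pass (NumPy stands in for TensorFlow; the weight shapes are hypothetical):

import numpy as np

def jump(z, k):
    # jump_activation(k) from the graph above: max(0, 1 - e^{-z})^k.
    # Larger k gives a sharper transition, which is what lets the network
    # approximate discontinuous selectivity functions.
    return np.power(np.maximum(0.0, 1.0 - np.exp(-z)), k)

def forward(x, hidden, out_w, out_b):
    # x: [bsz, column_num * 2] rows of (lower, upper) bound pairs;
    # hidden: one (W, b) pair per order k = 1..len(hidden).
    h = np.concatenate([jump(x @ w + b, k + 1)
                        for k, (w, b) in enumerate(hidden)], axis=1)
    return jump(h @ out_w + out_b, 1)  # [bsz, 1] predicted selectivity

# Toy shapes: column_num=1, order=2, neuron_num=4.
rng = np.random.default_rng(0)
hidden = [(rng.normal(1e-4, 1e-4, (2, 4)), np.zeros(4)) for _ in range(2)]
out_w, out_b = rng.normal(1e-4, 1e-4, (8, 1)), np.zeros(1)
print(forward(rng.uniform(-1, 1, (3, 2)), hidden, out_w, out_b).shape)  # (3, 1)
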
+    parser.add_argument('graph_out_path', type=str,
+                        help='Path to write graph output', nargs='+')
+    args = parser.parse_args()
+    model = AugmentedNN(args.column_num, args.order, args.neuron_num, args.lr)
+    model.tf_init()
+    model.write_graph(' '.join(args.graph_out_path))
+
+if __name__ == '__main__':
+    main()
diff --git a/src/brain/selectivity/augmented_nn.cpp b/src/brain/selectivity/augmented_nn.cpp
new file mode 100644
index 00000000000..d469fdcdffa
--- /dev/null
+++ b/src/brain/selectivity/augmented_nn.cpp
@@ -0,0 +1,167 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// augmented_nn.cpp
+//
+// Identification: src/brain/selectivity/augmented_nn.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include "brain/selectivity/augmented_nn.h"
+#include "brain/util/model_util.h"
+#include "brain/util/tf_session_entity/tf_session_entity.h"
+#include "brain/util/tf_session_entity/tf_session_entity_input.h"
+#include "brain/util/tf_session_entity/tf_session_entity_output.h"
+#include "util/file_util.h"
+
+namespace peloton {
+namespace brain {
+
+AugmentedNN::AugmentedNN(int column_num, int order, int neuron_num,
+                         float learn_rate, int batch_size, int epochs)
+    : BaseTFModel("src/brain/modelgen", "src/brain/modelgen/augmented_nn.py",
+                  "src/brain/modelgen/augmented_nn.pb"),
+      column_num_(column_num),
+      order_(order),
+      neuron_num_(neuron_num),
+      learn_rate_(learn_rate),
+      batch_size_(batch_size),
+      epochs_(epochs) {
+  GenerateModel(ConstructModelArgsString());
+  // Import the model
+  tf_session_entity_->ImportGraph(graph_path_);
+  // Initialize the model
+  TFInit();
+}
+
+std::string AugmentedNN::ConstructModelArgsString() const {
+  std::stringstream args_str_builder;
+  args_str_builder << " --column_num " << column_num_;
+  args_str_builder << " --order " << order_;
+  args_str_builder << " --neuron_num " << neuron_num_;
+  args_str_builder << " --lr " << learn_rate_;
+  args_str_builder << " " << this->modelgen_path_;
+  return args_str_builder.str();
+}
+
+std::string AugmentedNN::ToString() const {
+  std::stringstream model_str_builder;
+  model_str_builder << "augmented_nn(";
+  model_str_builder << "column_num = " << column_num_;
+  model_str_builder << ", order = " << order_;
+  model_str_builder << ", neuron_num = " << neuron_num_;
+  model_str_builder << ", lr = " << learn_rate_;
+  model_str_builder << ", batch_size = " << batch_size_;
+  model_str_builder << ")";
+  return model_str_builder.str();
+}
+
+// Returns a batch: the last column of `mat` is the target,
+// everything before it is the input.
+void AugmentedNN::GetBatch(const matrix_eig &mat, size_t batch_offset,
+                           size_t bsz, matrix_eig &data,
+                           matrix_eig &target) {
+  size_t row_idx = batch_offset * bsz;
+  data = mat.block(row_idx, 0, bsz, mat.cols() - 1);
+  target = mat.block(row_idx, mat.cols() - 1, bsz, 1);
+}
+
+// Backpropagate once
+void AugmentedNN::Fit(const matrix_eig &X, const matrix_eig &y, int bsz) {
+  auto data_batch = EigenUtil::Flatten(X);
+  auto target_batch = EigenUtil::Flatten(y);
+  std::vector<int64_t> dims_data{bsz, X.cols()};
+  std::vector<int64_t> dims_target{bsz, 1};
+  std::vector<TfFloatIn *> inputs_optimize{
+      new TfFloatIn(data_batch.data(), dims_data, "data_"),
+      new TfFloatIn(target_batch.data(), dims_target, "target_"),
+      new TfFloatIn(learn_rate_, "learn_rate_")};
+  tf_session_entity_->Eval(inputs_optimize, "optimizeOp_");
+  std::for_each(inputs_optimize.begin(), inputs_optimize.end(), TFIO_Delete);
+}
+
+// Train on `mat` for one epoch; returns the MSE over the epoch's batches
+float AugmentedNN::TrainEpoch(const matrix_eig &mat) {
+  std::vector<float> losses;
+  // Obtain relevant metadata
+  int min_allowed_bsz = 1;
+  int bsz = std::min((int)mat.rows(), std::max(batch_size_, min_allowed_bsz));
+  int number_of_batches = mat.rows() / bsz;
+  int num_cols = mat.cols() - 1;
+
+  std::vector<matrix_eig> y_batch, y_hat_batch;
+  // Run through each batch and compute loss/apply backprop
+  for (int batch_offset = 0; batch_offset < number_of_batches;
+       ++batch_offset) {
+    matrix_eig data_batch, target_batch;
+    GetBatch(mat, batch_offset, bsz, data_batch, target_batch);
+
+    std::vector<int64_t> dims_data{bsz, num_cols};
+    std::vector<int64_t> dims_target{bsz, 1};
+
+    Fit(data_batch, target_batch, bsz);
+
+    matrix_eig y_hat_eig = Predict(data_batch, bsz);
+    y_hat_batch.push_back(y_hat_eig);
+    y_batch.push_back(target_batch);
+  }
+  matrix_eig y = EigenUtil::VStack(y_batch);
+  matrix_eig y_hat = EigenUtil::VStack(y_hat_batch);
+  return ModelUtil::MeanSqError(y, y_hat);
+}
+
+// X: [bsz, column_num * 2]
+// return: [bsz, 1]
+matrix_eig AugmentedNN::Predict(const matrix_eig &X, int bsz) const {
+  auto data_batch = EigenUtil::Flatten(X);
+  std::vector<int64_t> dims_data{bsz, X.cols()};
+  std::vector<int64_t> dims_target{bsz, 1};
+
+  std::vector<TfFloatIn *> inputs_predict{
+      new TfFloatIn(data_batch.data(), dims_data, "data_")};
+  auto output_predict = new TfFloatOut(dims_target, "pred_");
+  // Obtain predicted values
+  auto out = tf_session_entity_->Eval(inputs_predict, output_predict);
+
+  matrix_t y_hat;
+  for (int res_idx = 0; res_idx < bsz; res_idx++) {
+    vector_t res = {out[res_idx]};
+    y_hat.push_back(res);
+  }
+  std::for_each(inputs_predict.begin(), inputs_predict.end(), TFIO_Delete);
+  TFIO_Delete(output_predict);
+  return EigenUtil::ToEigenMat(y_hat);
+}
+
+// Same batching as TrainEpoch, but only predicts (no backprop)
+float AugmentedNN::ValidateEpoch(const matrix_eig &mat) {
+  // Obtain relevant metadata
+  int min_allowed_bsz = 1;
+  int bsz = std::min((int)mat.rows(), std::max(batch_size_, min_allowed_bsz));
+  int number_of_batches = mat.rows() / bsz;
+  int num_cols = mat.cols() - 1;
+
+  std::vector<matrix_eig> y_batch, y_hat_batch;
+  // Run through each batch and compute the validation loss
+  for (int batch_offset = 0; batch_offset < number_of_batches;
+       ++batch_offset) {
+    matrix_eig data_batch, target_batch;
+    GetBatch(mat, batch_offset, bsz, data_batch, target_batch);
+
+    std::vector<int64_t> dims_data{bsz, num_cols};
+    std::vector<int64_t> dims_target{bsz, 1};
+
+    matrix_eig y_hat_eig = Predict(data_batch, bsz);
+    y_hat_batch.push_back(y_hat_eig);
+    y_batch.push_back(target_batch);
+  }
+  matrix_eig y = EigenUtil::VStack(y_batch);
+  matrix_eig y_hat = EigenUtil::VStack(y_hat_batch);
+  return ModelUtil::MeanSqError(y, y_hat);
+}
+} // namespace brain
+} // namespace peloton
diff --git a/src/brain/selectivity/selectivity_defaults.cpp b/src/brain/selectivity/selectivity_defaults.cpp
new file mode 100644
index 00000000000..c1b8254edc1
--- /dev/null
+++ b/src/brain/selectivity/selectivity_defaults.cpp
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// selectivity_defaults.cpp
+//
+// Identification: src/brain/selectivity/selectivity_defaults.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include "brain/selectivity/selectivity_defaults.h"
+
+namespace peloton {
+namespace brain {
+
+const int AugmentedNNDefaults::COLUMN_NUM = 1;
+const int AugmentedNNDefaults::ORDER = 1;
+const int AugmentedNNDefaults::NEURON_NUM = 16;
+const float AugmentedNNDefaults::LR = 0.1f;
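
A NumPy restatement of the batching scheme in TrainEpoch/ValidateEpoch above, with `fit` and `predict` as stand-ins for the TensorFlow session calls (a sketch under those assumptions, not the patch's API). Note that rows beyond number_of_batches * bsz are silently dropped, and that the epoch loss is the MSE over the stacked per-batch predictions:

import numpy as np

def run_epoch(mat, predict, fit=None, batch_size=256):
    # mat: [n, column_num * 2 + 1]; the last column is the target.
    bsz = min(mat.shape[0], max(batch_size, 1))
    n_batches = mat.shape[0] // bsz        # leftover rows are dropped
    ys, y_hats = [], []
    for b in range(n_batches):
        rows = slice(b * bsz, (b + 1) * bsz)
        data, target = mat[rows, :-1], mat[rows, -1:]
        if fit is not None:                # TrainEpoch: backprop first;
            fit(data, target)              # ValidateEpoch: fit=None
        y_hats.append(predict(data))       # then re-predict the batch
        ys.append(target)
    y, y_hat = np.vstack(ys), np.vstack(y_hats)
    return float(np.mean((y - y_hat) ** 2))   # ModelUtil::MeanSqError
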
+const int AugmentedNNDefaults::BATCH_SIZE = 256;
+const int AugmentedNNDefaults::EPOCHS = 600;
+
+} // namespace brain
+} // namespace peloton
diff --git a/src/brain/workload/workload_defaults.cpp b/src/brain/workload/workload_defaults.cpp
index c1537b138da..529299db9a6 100644
--- a/src/brain/workload/workload_defaults.cpp
+++ b/src/brain/workload/workload_defaults.cpp
@@ -1,40 +1,40 @@
-//===----------------------------------------------------------------------===//
-//
-// Peloton
-//
-// workload_defaults.cpp
-//
-// Identification: src/brain/workload/workload_defaults.cpp
-//
-// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
-//
-//===----------------------------------------------------------------------===//
-
-#include "brain/workload/workload_defaults.h"
-
-namespace peloton {
-namespace brain {
-
-const int CommonWorkloadDefaults::HORIZON = 216;
-const int CommonWorkloadDefaults::INTERVAL = 100;
-const int CommonWorkloadDefaults::PADDLING_DAYS = 7;
-const int CommonWorkloadDefaults::ESTOP_PATIENCE = 10;
-const float CommonWorkloadDefaults::ESTOP_DELTA = 0.01f;
-
-const int LSTMWorkloadDefaults::NFEATS = 3;
-const int LSTMWorkloadDefaults::NENCODED = 20;
-const int LSTMWorkloadDefaults::NHID = 20;
-const int LSTMWorkloadDefaults::NLAYERS = 2;
-const float LSTMWorkloadDefaults::LR = 0.01f;
-const float LSTMWorkloadDefaults::DROPOUT_RATE = 0.5f;
-const float LSTMWorkloadDefaults::CLIP_NORM = 0.5f;
-const int LSTMWorkloadDefaults::BATCH_SIZE = 12;
-const int LSTMWorkloadDefaults::BPTT = 90;
-const int LSTMWorkloadDefaults::EPOCHS = 100;
-
-const int LinearRegWorkloadDefaults::BPTT = 90;
-
-const int KernelRegWorkloadDefaults::BPTT = 90;
-
-} // namespace brain
-} // namespace peloton
\ No newline at end of file
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// workload_defaults.cpp
+//
+// Identification: src/brain/workload/workload_defaults.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include "brain/workload/workload_defaults.h"
+
+namespace peloton {
+namespace brain {
+
+const int CommonWorkloadDefaults::HORIZON = 216;
+const int CommonWorkloadDefaults::INTERVAL = 100;
+const int CommonWorkloadDefaults::PADDLING_DAYS = 7;
+const int CommonWorkloadDefaults::ESTOP_PATIENCE = 10;
+const float CommonWorkloadDefaults::ESTOP_DELTA = 0.01f;
+
+const int LSTMWorkloadDefaults::NFEATS = 3;
+const int LSTMWorkloadDefaults::NENCODED = 20;
+const int LSTMWorkloadDefaults::NHID = 20;
+const int LSTMWorkloadDefaults::NLAYERS = 2;
+const float LSTMWorkloadDefaults::LR = 0.01f;
+const float LSTMWorkloadDefaults::DROPOUT_RATE = 0.5f;
+const float LSTMWorkloadDefaults::CLIP_NORM = 0.5f;
+const int LSTMWorkloadDefaults::BATCH_SIZE = 12;
+const int LSTMWorkloadDefaults::BPTT = 90;
+const int LSTMWorkloadDefaults::EPOCHS = 100;
+
+const int LinearRegWorkloadDefaults::BPTT = 90;
+
+const int KernelRegWorkloadDefaults::BPTT = 90;
+
+} // namespace brain
+} // namespace peloton
diff --git a/src/include/brain/selectivity/augmented_nn.h b/src/include/brain/selectivity/augmented_nn.h
new file mode 100644
index 00000000000..f39715b665f
--- /dev/null
+++ b/src/include/brain/selectivity/augmented_nn.h
@@ -0,0 +1,108 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// augmented_nn.h
+//
+// Identification: src/include/brain/selectivity/augmented_nn.h
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "brain/workload/base_tf.h"
+
+namespace peloton {
+namespace brain {
+
+template <typename Type>
+class TfSessionEntityInput;
+template <typename Type>
+class TfSessionEntityOutput;
+
+using TfFloatIn = TfSessionEntityInput<float>;
+using TfFloatOut = TfSessionEntityOutput<float>;
+
+/**
+ * AugmentedNN is used to predict selectivity or cardinality
+ * for range predicates, e.g.,
+ * SELECT * FROM table WHERE c1 >= l1 AND c1 <= u1
+ *                       AND c2 >= l2 AND c2 <= u2
+ *                       AND ...
+ * Input is [l1, u1, l2, u2, ...]
+ * Output is selectivity or cardinality
+ */
+class AugmentedNN : public BaseTFModel {
+ public:
+  AugmentedNN(int column_num, int order, int neuron_num,
+              float learn_rate, int batch_size, int epochs);
+  /**
+   * Train the Tensorflow model
+   * @param mat: Training data
+   * @return: Average training loss
+   * Given a matrix of training data, this function:
+   * 1. breaks the data into batches ('Batchify')
+   * 2. prepares tensorflow-entity inputs/outputs
+   * 3. computes loss and applies backprop
+   * Finally the average training loss over all the
+   * batches is returned.
+   */
+  float TrainEpoch(const matrix_eig &mat);
+
+  /**
+   * @param mat: Validation data
+   * @return: Average validation loss
+   * This applies the same set of steps as TrainEpoch.
+   * However, instead of applying backprop, it obtains predicted values.
+   * Then the validation loss is calculated.
+   */
+  float ValidateEpoch(const matrix_eig &mat);
+
+  /**
+   * @param X: Input for the model
+   * @param y: Ground truth output corresponding to X
+   * @param bsz: Batch size
+   * This function applies backprop once.
+   */
+  void Fit(const matrix_eig &X, const matrix_eig &y, int bsz) override;
+
+  /**
+   * @param X: Input for the model
+   * @param bsz: Batch size
+   * This function obtains predictions for the input from the model.
+   */
+  matrix_eig Predict(const matrix_eig &X, int bsz) const override;
+
+  /**
+   * @return std::string representing the model object
+   */
+  std::string ToString() const override;
+  int GetEpochs() const { return epochs_; }
+  int GetBatchsize() const { return batch_size_; }
+
+ private:
+  /**
+   * Utility function to create batches from the given data
+   * to be fed into the AugmentedNN model
+   */
+  void GetBatch(const matrix_eig &mat, size_t batch_offset, size_t bsz,
+                matrix_eig &data, matrix_eig &target);
+  // Function to generate the args string to feed the python model
+  std::string ConstructModelArgsString() const;
+
+  // Attributes needed for the AugmentedNN model
+  // number of columns used as input
+  int column_num_;
+  // max order of the activation function
+  int order_;
+  // number of neurons in the hidden layer
+  int neuron_num_;
+  float learn_rate_;
+  int batch_size_;
+  int epochs_;
+};
+} // namespace brain
+} // namespace peloton
diff --git a/src/include/brain/selectivity/selectivity_defaults.h b/src/include/brain/selectivity/selectivity_defaults.h
new file mode 100644
index 00000000000..c7aad20e3a4
--- /dev/null
+++ b/src/include/brain/selectivity/selectivity_defaults.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// selectivity_defaults.h
+//
+// Identification: src/include/brain/selectivity/selectivity_defaults.h
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+/**
+ * This header file contains default attributes
+ * associated with the selectivity prediction task
+ **/
+
+namespace peloton {
+namespace brain {
+
+struct AugmentedNNDefaults {
+  static const int COLUMN_NUM;
+  static const int ORDER;
+  static const int NEURON_NUM;
+  static const float LR;
+  static const int BATCH_SIZE;
+  static const int EPOCHS;
+};
+
+} // namespace brain
+} // namespace peloton
diff --git a/src/include/brain/workload/workload_defaults.h b/src/include/brain/workload/workload_defaults.h
index 126cc50ca0b..82ca9604d4b 100644
--- a/src/include/brain/workload/workload_defaults.h
+++ b/src/include/brain/workload/workload_defaults.h
@@ -1,68 +1,69 @@
-//===----------------------------------------------------------------------===//
-//
-// Peloton
-//
-// workload_defaults.h
-//
-// Identification: src/include/brain/workload/workload_defaults.h
-//
-// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-/**
- * This header file contains default attributes
- * associated with the workload prediction task
- **/
-
-namespace peloton {
-namespace brain {
-
-/**
- * Common defaults(that should be uniform) across all models
- * for the Workload Forecasting task
- * // TODO(saatviks): SEGMENT/AGGREGATE not needed?
- * // TODO(saatviks): Look into using a timer type(Default unit = minutes) - */ -struct CommonWorkloadDefaults { - static const int HORIZON; - static const int INTERVAL; - static const int PADDLING_DAYS; - // Early Stop parameters - static const int ESTOP_PATIENCE; - static const float ESTOP_DELTA; -}; - -/** - * LSTM Model defaults for Workload Forecasting task - */ -struct LSTMWorkloadDefaults { - static const int NFEATS; - static const int NENCODED; - static const int NHID; - static const int NLAYERS; - static const float LR; - static const float DROPOUT_RATE; - static const float CLIP_NORM; - static const int BATCH_SIZE; - static const int BPTT; - static const int EPOCHS; -}; - -/** - * LinearReg Model defaults for Workload Forecasting task - */ -struct LinearRegWorkloadDefaults { - static const int BPTT; -}; - -/** - * KernelReg Model defaults for Workload Forecasting task - */ -struct KernelRegWorkloadDefaults { - static const int BPTT; -}; -} // namespace brain -} // namespace peloton \ No newline at end of file +//===----------------------------------------------------------------------===// +// +// Peloton +// +// workload_defaults.h +// +// Identification: src/include/brain/workload/workload_defaults.h +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#pragma once + +/** + * This header file contains default attributes + * associated with the workload prediction task + **/ + +namespace peloton { +namespace brain { + +/** + * Common defaults(that should be uniform) across all models + * for the Workload Forecasting task + * // TODO(saatviks): SEGMENT/AGGREGATE not needed? + * // TODO(saatviks): Look into using a timer type(Default unit = minutes) + */ +struct CommonWorkloadDefaults { + static const int HORIZON; + static const int INTERVAL; + static const int PADDLING_DAYS; + // Early Stop parameters + static const int ESTOP_PATIENCE; + static const float ESTOP_DELTA; +}; + +/** + * LSTM Model defaults for Workload Forecasting task + */ +struct LSTMWorkloadDefaults { + static const int NFEATS; + static const int NENCODED; + static const int NHID; + static const int NLAYERS; + static const float LR; + static const float DROPOUT_RATE; + static const float CLIP_NORM; + static const int BATCH_SIZE; + static const int BPTT; + static const int EPOCHS; +}; + +/** + * LinearReg Model defaults for Workload Forecasting task + */ +struct LinearRegWorkloadDefaults { + static const int BPTT; +}; + +/** + * KernelReg Model defaults for Workload Forecasting task + */ +struct KernelRegWorkloadDefaults { + static const int BPTT; +}; + +} // namespace brain +} // namespace peloton diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cc983ed5090..04802396492 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -50,6 +50,7 @@ set(TESTING_UTIL_INDEX ${PROJECT_SOURCE_DIR}/test/index/testing_index_util. 
 set(TESTING_UTIL_CODEGEN ${PROJECT_SOURCE_DIR}/test/codegen/testing_codegen_util.cpp)
 set(TESTING_UTIL_FORECAST ${PROJECT_SOURCE_DIR}/test/brain/testing_forecast_util.cpp)
 set(TESTING_UTIL_OPTIMIZER ${PROJECT_SOURCE_DIR}/test/optimizer/optimizer_test_util.cpp)
+set(TESTING_UTIL_AUGMENTEDNN ${PROJECT_SOURCE_DIR}/test/brain/testing_augmented_nn_util.cpp)
 add_library(peloton-test-common EXCLUDE_FROM_ALL ${gmock_srcs} ${HARNESS}
   ${TESTING_UTIL_EXECUTOR}
@@ -62,6 +63,7 @@ add_library(peloton-test-common EXCLUDE_FROM_ALL ${gmock_srcs} ${HARNESS}
   ${TESTING_UTIL_CODEGEN}
   ${TESTING_UTIL_FORECAST}
   ${TESTING_UTIL_OPTIMIZER}
+  ${TESTING_UTIL_AUGMENTEDNN}
 )
 
 # --[ Add "make check" target
diff --git a/test/brain/augmented_nn_test.cpp b/test/brain/augmented_nn_test.cpp
new file mode 100644
index 00000000000..c370771f552
--- /dev/null
+++ b/test/brain/augmented_nn_test.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Peloton
+//
+// augmented_nn_test.cpp
+//
+// Identification: test/brain/augmented_nn_test.cpp
+//
+// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
+//
+//===----------------------------------------------------------------------===//
+
+#include <memory>
+#include "brain/selectivity/augmented_nn.h"
+#include "brain/selectivity/selectivity_defaults.h"
+#include "brain/testing_augmented_nn_util.h"
+#include "brain/util/model_util.h"
+#include "brain/workload/workload_defaults.h"
+#include "common/harness.h"
+#include "util/file_util.h"
+
+namespace peloton {
+namespace test {
+class AugmentedNNTests : public PelotonTest {};
+
+TEST_F(AugmentedNNTests, DISABLED_AugmentedNNUniformTest) {
+  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
+      brain::AugmentedNNDefaults::COLUMN_NUM,
+      brain::AugmentedNNDefaults::ORDER,
+      brain::AugmentedNNDefaults::NEURON_NUM,
+      brain::AugmentedNNDefaults::LR,
+      brain::AugmentedNNDefaults::BATCH_SIZE,
+      brain::AugmentedNNDefaults::EPOCHS));
+  EXPECT_TRUE(model->IsTFModel());
+  size_t LOG_INTERVAL = 20;
+  size_t NUM_SAMPLES = 10000;
+  float VAL_SPLIT = 0.5;
+  bool NORMALIZE = false;
+  float VAL_THRESH = 0.05;
+
+  TestingAugmentedNNUtil::Test(*model, DistributionType::UniformDistribution,
+                               LOG_INTERVAL, NUM_SAMPLES,
+                               VAL_SPLIT, NORMALIZE, VAL_THRESH,
+                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
+                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
+}
+
+TEST_F(AugmentedNNTests, DISABLED_AugmentedNNSkewedTest) {
+  auto model = std::unique_ptr<brain::AugmentedNN>(new brain::AugmentedNN(
+      brain::AugmentedNNDefaults::COLUMN_NUM,
+      brain::AugmentedNNDefaults::ORDER,
+      brain::AugmentedNNDefaults::NEURON_NUM,
+      brain::AugmentedNNDefaults::LR,
+      brain::AugmentedNNDefaults::BATCH_SIZE,
+      brain::AugmentedNNDefaults::EPOCHS));
+  EXPECT_TRUE(model->IsTFModel());
+  size_t LOG_INTERVAL = 20;
+  size_t NUM_SAMPLES = 10000;
+  float VAL_SPLIT = 0.5;
+  bool NORMALIZE = false;
+  float VAL_THRESH = 0.05;
+
+  TestingAugmentedNNUtil::Test(*model, DistributionType::SkewedDistribution,
+                               LOG_INTERVAL, NUM_SAMPLES,
+                               VAL_SPLIT, NORMALIZE, VAL_THRESH,
+                               brain::CommonWorkloadDefaults::ESTOP_PATIENCE,
+                               brain::CommonWorkloadDefaults::ESTOP_DELTA);
+}
+
+} // namespace test
+} // namespace peloton
diff --git a/test/brain/model_test.cpp b/test/brain/model_test.cpp
index 2615652cd9f..1a26f3ed603 100644
--- a/test/brain/model_test.cpp
+++ b/test/brain/model_test.cpp
@@ -1,139 +1,140 @@
-//===----------------------------------------------------------------------===//
-//
-// Peloton
-//
-// linear_models_test.cpp
-//
-// Identification:
test/brain/linear_model_test.cpp -// -// Copyright (c) 2015-2018, Carnegie Mellon University Database Group -// -//===----------------------------------------------------------------------===// - -#include -#include "brain/testing_forecast_util.h" -#include "brain/util/model_util.h" -#include "brain/workload/kernel_model.h" -#include "brain/workload/linear_model.h" -#include "brain/workload/lstm.h" -#include "brain/workload/workload_defaults.h" -#include "common/harness.h" -#include "util/file_util.h" - -namespace peloton { -namespace test { -class ModelTests : public PelotonTest {}; - -TEST_F(ModelTests, NormalizerTest) { - auto n = brain::Normalizer(); - matrix_eig X = - brain::EigenUtil::ToEigenMat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); - n.Fit(X); - matrix_eig Xrecon = n.ReverseTransform(n.Transform(X)); - EXPECT_TRUE(Xrecon.isApprox(X)); -} - -// Enable after resolving -TEST_F(ModelTests, DISABLED_TimeSeriesLSTMTest) { - auto model = std::unique_ptr(new brain::TimeSeriesLSTM( - brain::LSTMWorkloadDefaults::NFEATS, - brain::LSTMWorkloadDefaults::NENCODED, brain::LSTMWorkloadDefaults::NHID, - brain::LSTMWorkloadDefaults::NLAYERS, brain::LSTMWorkloadDefaults::LR, - brain::LSTMWorkloadDefaults::DROPOUT_RATE, - brain::LSTMWorkloadDefaults::CLIP_NORM, - brain::LSTMWorkloadDefaults::BATCH_SIZE, - brain::LSTMWorkloadDefaults::BPTT, brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL, - brain::LSTMWorkloadDefaults::EPOCHS)); - EXPECT_TRUE(model->IsTFModel()); - size_t LOG_INTERVAL = 20; - size_t NUM_SAMPLES = 1000; - size_t NUM_FEATS = 3; - float VAL_SPLIT = 0.5; - bool NORMALIZE = false; - float VAL_THESH = 0.05; - TestingForecastUtil::WorkloadTest(*model, WorkloadType::SimpleSinusoidal, - LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, - VAL_SPLIT, NORMALIZE, VAL_THESH, - brain::CommonWorkloadDefaults::ESTOP_PATIENCE, - brain::CommonWorkloadDefaults::ESTOP_DELTA); -} - -TEST_F(ModelTests, LinearRegTest) { - auto model = std::unique_ptr( - new brain::TimeSeriesLinearReg(brain::LinearRegWorkloadDefaults::BPTT, - brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL)); - EXPECT_FALSE(model->IsTFModel()); - size_t LOG_INTERVAL = 1; - size_t NUM_SAMPLES = 1000; - size_t NUM_FEATS = 3; - float VAL_SPLIT = 0.5; - bool NORMALIZE = true; - float VAL_THESH = 0.1; - TestingForecastUtil::WorkloadTest(*model, WorkloadType::NoisyLinear, - LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, - VAL_SPLIT, NORMALIZE, VAL_THESH, - brain::CommonWorkloadDefaults::ESTOP_PATIENCE, - brain::CommonWorkloadDefaults::ESTOP_DELTA); -} - -TEST_F(ModelTests, KernelRegTest) { - auto model = std::unique_ptr( - new brain::TimeSeriesKernelReg(brain::KernelRegWorkloadDefaults::BPTT, - brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL)); - EXPECT_FALSE(model->IsTFModel()); - size_t LOG_INTERVAL = 1; - size_t NUM_SAMPLES = 1000; - size_t NUM_FEATS = 3; - float VAL_SPLIT = 0.5; - bool NORMALIZE = true; - float VAL_THESH = 0.1; - TestingForecastUtil::WorkloadTest(*model, WorkloadType::NoisyLinear, - LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, - VAL_SPLIT, NORMALIZE, VAL_THESH, - brain::CommonWorkloadDefaults::ESTOP_PATIENCE, - brain::CommonWorkloadDefaults::ESTOP_DELTA); -} - -TEST_F(ModelTests, DISABLED_TimeSeriesEnsembleTest) { - auto lr_model = std::make_shared( - brain::LinearRegWorkloadDefaults::BPTT, - brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL); - auto kr_model = std::make_shared( - brain::KernelRegWorkloadDefaults::BPTT, - 
brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL); - auto lstm_model = std::make_shared( - brain::LSTMWorkloadDefaults::NFEATS, - brain::LSTMWorkloadDefaults::NENCODED, brain::LSTMWorkloadDefaults::NHID, - brain::LSTMWorkloadDefaults::NLAYERS, brain::LSTMWorkloadDefaults::LR, - brain::LSTMWorkloadDefaults::DROPOUT_RATE, - brain::LSTMWorkloadDefaults::CLIP_NORM, - brain::LSTMWorkloadDefaults::BATCH_SIZE, - brain::LSTMWorkloadDefaults::BPTT, brain::CommonWorkloadDefaults::HORIZON, - brain::CommonWorkloadDefaults::INTERVAL, - brain::LSTMWorkloadDefaults::EPOCHS); - - size_t LOG_INTERVAL = 20; - size_t NUM_SAMPLES = 1000; - size_t NUM_FEATS = 3; - float VAL_SPLIT = 0.5; - bool NORMALIZE = false; - float VAL_THESH = 0.06; - - auto model = - std::unique_ptr(new brain::TimeSeriesEnsemble( - {lr_model, kr_model, lstm_model}, {0.33f, 0.33f, 0.33}, - brain::LSTMWorkloadDefaults::BATCH_SIZE)); - TestingForecastUtil::WorkloadTest(*model, WorkloadType::SimpleSinusoidal, - LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, - VAL_SPLIT, NORMALIZE, VAL_THESH, - brain::CommonWorkloadDefaults::ESTOP_PATIENCE, - brain::CommonWorkloadDefaults::ESTOP_DELTA); -} - -} // namespace test -} // namespace peloton \ No newline at end of file +//===----------------------------------------------------------------------===// +// +// Peloton +// +// linear_models_test.cpp +// +// Identification: test/brain/linear_model_test.cpp +// +// Copyright (c) 2015-2018, Carnegie Mellon University Database Group +// +//===----------------------------------------------------------------------===// + +#include +#include "brain/testing_forecast_util.h" +#include "brain/util/model_util.h" +#include "brain/workload/kernel_model.h" +#include "brain/workload/linear_model.h" +#include "brain/workload/lstm.h" +#include "brain/workload/workload_defaults.h" +#include "common/harness.h" +#include "util/file_util.h" + +namespace peloton { +namespace test { +class ModelTests : public PelotonTest {}; + +TEST_F(ModelTests, NormalizerTest) { + auto n = brain::Normalizer(); + matrix_eig X = + brain::EigenUtil::ToEigenMat({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + n.Fit(X); + matrix_eig Xrecon = n.ReverseTransform(n.Transform(X)); + EXPECT_TRUE(Xrecon.isApprox(X)); +} + +// Enable after resolving +TEST_F(ModelTests, DISABLED_TimeSeriesLSTMTest) { + auto model = std::unique_ptr(new brain::TimeSeriesLSTM( + brain::LSTMWorkloadDefaults::NFEATS, + brain::LSTMWorkloadDefaults::NENCODED, brain::LSTMWorkloadDefaults::NHID, + brain::LSTMWorkloadDefaults::NLAYERS, brain::LSTMWorkloadDefaults::LR, + brain::LSTMWorkloadDefaults::DROPOUT_RATE, + brain::LSTMWorkloadDefaults::CLIP_NORM, + brain::LSTMWorkloadDefaults::BATCH_SIZE, + brain::LSTMWorkloadDefaults::BPTT, brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL, + brain::LSTMWorkloadDefaults::EPOCHS)); + EXPECT_TRUE(model->IsTFModel()); + size_t LOG_INTERVAL = 20; + size_t NUM_SAMPLES = 1000; + size_t NUM_FEATS = 3; + float VAL_SPLIT = 0.5; + bool NORMALIZE = false; + float VAL_THESH = 0.05; + TestingForecastUtil::WorkloadTest(*model, WorkloadType::SimpleSinusoidal, + LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, + VAL_SPLIT, NORMALIZE, VAL_THESH, + brain::CommonWorkloadDefaults::ESTOP_PATIENCE, + brain::CommonWorkloadDefaults::ESTOP_DELTA); +} + +TEST_F(ModelTests, LinearRegTest) { + auto model = std::unique_ptr( + new brain::TimeSeriesLinearReg(brain::LinearRegWorkloadDefaults::BPTT, + brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL)); + 
EXPECT_FALSE(model->IsTFModel()); + size_t LOG_INTERVAL = 1; + size_t NUM_SAMPLES = 1000; + size_t NUM_FEATS = 3; + float VAL_SPLIT = 0.5; + bool NORMALIZE = true; + float VAL_THESH = 0.1; + TestingForecastUtil::WorkloadTest(*model, WorkloadType::NoisyLinear, + LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, + VAL_SPLIT, NORMALIZE, VAL_THESH, + brain::CommonWorkloadDefaults::ESTOP_PATIENCE, + brain::CommonWorkloadDefaults::ESTOP_DELTA); +} + +TEST_F(ModelTests, KernelRegTest) { + auto model = std::unique_ptr( + new brain::TimeSeriesKernelReg(brain::KernelRegWorkloadDefaults::BPTT, + brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL)); + EXPECT_FALSE(model->IsTFModel()); + size_t LOG_INTERVAL = 1; + size_t NUM_SAMPLES = 1000; + size_t NUM_FEATS = 3; + float VAL_SPLIT = 0.5; + bool NORMALIZE = true; + float VAL_THESH = 0.1; + TestingForecastUtil::WorkloadTest(*model, WorkloadType::NoisyLinear, + LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, + VAL_SPLIT, NORMALIZE, VAL_THESH, + brain::CommonWorkloadDefaults::ESTOP_PATIENCE, + brain::CommonWorkloadDefaults::ESTOP_DELTA); +} + +TEST_F(ModelTests, DISABLED_TimeSeriesEnsembleTest) { + auto lr_model = std::make_shared( + brain::LinearRegWorkloadDefaults::BPTT, + brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL); + auto kr_model = std::make_shared( + brain::KernelRegWorkloadDefaults::BPTT, + brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL); + auto lstm_model = std::make_shared( + brain::LSTMWorkloadDefaults::NFEATS, + brain::LSTMWorkloadDefaults::NENCODED, brain::LSTMWorkloadDefaults::NHID, + brain::LSTMWorkloadDefaults::NLAYERS, brain::LSTMWorkloadDefaults::LR, + brain::LSTMWorkloadDefaults::DROPOUT_RATE, + brain::LSTMWorkloadDefaults::CLIP_NORM, + brain::LSTMWorkloadDefaults::BATCH_SIZE, + brain::LSTMWorkloadDefaults::BPTT, brain::CommonWorkloadDefaults::HORIZON, + brain::CommonWorkloadDefaults::INTERVAL, + brain::LSTMWorkloadDefaults::EPOCHS); + + size_t LOG_INTERVAL = 20; + size_t NUM_SAMPLES = 1000; + size_t NUM_FEATS = 3; + float VAL_SPLIT = 0.5; + bool NORMALIZE = false; + float VAL_THESH = 0.06; + + auto model = + std::unique_ptr(new brain::TimeSeriesEnsemble( + {lr_model, kr_model, lstm_model}, {0.33f, 0.33f, 0.33}, + brain::LSTMWorkloadDefaults::BATCH_SIZE)); + TestingForecastUtil::WorkloadTest(*model, WorkloadType::SimpleSinusoidal, + LOG_INTERVAL, NUM_SAMPLES, NUM_FEATS, + VAL_SPLIT, NORMALIZE, VAL_THESH, + brain::CommonWorkloadDefaults::ESTOP_PATIENCE, + brain::CommonWorkloadDefaults::ESTOP_DELTA); +} + + +} // namespace test +} // namespace peloton diff --git a/test/brain/testing_augmented_nn_util.cpp b/test/brain/testing_augmented_nn_util.cpp new file mode 100644 index 00000000000..648ec7dca3c --- /dev/null +++ b/test/brain/testing_augmented_nn_util.cpp @@ -0,0 +1,297 @@ +#include "brain/testing_augmented_nn_util.h" +#include +#include "brain/util/model_util.h" +#include "brain/util/eigen_util.h" +#include "common/harness.h" +#include + +namespace peloton { +namespace test { + +void TestingAugmentedNNUtil::Test( + brain::AugmentedNN &model, DistributionType d, + size_t val_interval, size_t num_samples, + float val_split, bool normalize, float val_loss_thresh, + size_t early_stop_patience, float early_stop_delta) { + LOG_INFO("Using Model: %s", model.ToString().c_str()); + size_t num_tests = model.GetBatchsize(); + matrix_eig all_data = GetData(d, num_samples, num_tests); + + matrix_eig test_data = all_data.bottomRows(num_tests*3); + matrix_eig data = 
all_data.topRows(all_data.rows() - num_tests*3);
+
+  brain::Normalizer n(normalize);
+  val_interval = std::min<size_t>(val_interval, model.GetEpochs());
+
+  // Determine the split point
+  size_t split_point =
+      data.rows() - static_cast<size_t>(data.rows() * val_split);
+
+  // Split into train/validate data
+  matrix_eig train_data = data.topRows(split_point);
+  n.Fit(train_data);
+  train_data = n.Transform(train_data);
+  matrix_eig validate_data =
+      n.Transform(data.bottomRows(
+          static_cast<size_t>(data.rows() - split_point)));
+
+  vector_eig train_loss_avg = vector_eig::Zero(val_interval);
+  float prev_train_loss = std::numeric_limits<float>::max();
+  float val_loss = val_loss_thresh * 2;
+  std::vector<float> val_losses;
+  for (int epoch = 1; epoch <= model.GetEpochs() &&
+                      !brain::ModelUtil::EarlyStop(
+                          val_losses, early_stop_patience, early_stop_delta);
+       epoch++) {
+    auto train_loss = model.TrainEpoch(train_data);
+    size_t idx = (epoch - 1) % val_interval;
+    train_loss_avg(idx) = train_loss;
+    if (epoch % val_interval == 0) {
+      val_loss = model.ValidateEpoch(validate_data);
+      val_losses.push_back(val_loss);
+      train_loss = train_loss_avg.mean();
+      EXPECT_LE(train_loss, prev_train_loss);
+      LOG_DEBUG("Train Loss: %.10f, Valid Loss: %.10f", train_loss, val_loss);
+      prev_train_loss = train_loss;
+    }
+  }
+  EXPECT_LE(val_loss, val_loss_thresh);
+
+  matrix_eig check_data =
+      test_data.block(0, 0, test_data.rows(), test_data.cols() - 1);
+  matrix_eig check_target_data =
+      test_data.block(0, test_data.cols() - 1, test_data.rows(), 1);
+
+  matrix_eig test_res = model.Predict(check_data, num_tests*3);
+
+  LOG_INFO("Test on the high end: ");
+  for (size_t i = 0; i < 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  float test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.topRows(num_tests),
+      test_res.topRows(num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+
+  LOG_INFO("Test on the low end: ");
+  for (size_t i = num_tests; i < num_tests + 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.middleRows(num_tests, num_tests),
+      test_res.middleRows(num_tests, num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+
+  LOG_INFO("Test randomly: ");
+  for (size_t i = 2 * num_tests; i < 2 * num_tests + 10; i++) {
+    LOG_INFO("Truth: %.8f, Pred: %.8f",
+             check_target_data(i,0), test_res(i,0));
+  }
+  test_loss = peloton::brain::ModelUtil::MeanSqError(
+      check_target_data.bottomRows(num_tests),
+      test_res.bottomRows(num_tests));
+  LOG_INFO("AMSE: %.8f", test_loss);
+}
+
+matrix_eig TestingAugmentedNNUtil::GetData(DistributionType d,
+                                           size_t num_samples,
+                                           size_t num_tests) {
+  matrix_eig data;
+  switch (d) {
+    case DistributionType::UniformDistribution: {
+      int NUM_X = 1000;
+      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
+      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
+      float sum_hist = 0;
+      for (int i = 1; i <= NUM_X; i++) {
+        hist(i, 0) = 100;
+      }
+
+      for (int i = 1; i <= NUM_X; i++) {
+        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
+      }
+      sum_hist = sum(NUM_X, 0);
+
+      // generate training and validating data randomly
+      data = matrix_eig::Zero(num_samples, 3);  // 3: lowerbound, upperbound, sel
+      std::mt19937 rng;
+      rng.seed(std::random_device()());
+      std::uniform_int_distribution<int> dist(1, NUM_X);
+
+      // data: [lowerbound, upperbound, truth selectivity]
+      for (size_t i = 0; i < num_samples; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        // assume the max & min values of the column are known,
+        // so the [min, max] -> [-1, 1] preprocessing can be done here
+        data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        data(i, 2) = sel;
+      }
+
+      float HIGH_SEL = 0.8;
+      float LOW_SEL = 0.2;
+
+      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
+
+      // generate test data with high selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l, u;
+        float sel;
+        do {
+          l = dist(rng);
+          u = dist(rng);
+          if (l > u) {
+            std::swap(l, u);
+          }
+          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        } while (sel <= HIGH_SEL);
+        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_highsel_data(i, 2) = sel;
+      }
+
+      // generate test data with low selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l, u;
+        float sel;
+        do {
+          l = dist(rng);
+          u = dist(rng);
+          if (l > u) {
+            std::swap(l, u);
+          }
+          sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        } while (sel >= LOW_SEL);
+        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_lowsel_data(i, 2) = sel;
+      }
+
+      // generate test data with random selectivity
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_random_data(i, 2) = sel;
+      }
+
+      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
+                                          test_lowsel_data, test_random_data};
+      data = peloton::brain::EigenUtil::VStack(data_vec);
+
+      break;
+    }
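
The uniform branch above derives ground-truth selectivities from a histogram prefix sum, sel(l, u) = (sum[u] - sum[l-1]) / sum[NUM_X], and rescales bounds to roughly [-1, 1] before training. The same computation as a standalone NumPy sketch (the flat histogram and NUM_X mirror the constants above):

import numpy as np

NUM_X = 1000
counts = np.full(NUM_X, 100.0)                 # uniform: hist(i) = 100
prefix = np.concatenate(([0.0], np.cumsum(counts)))

def selectivity(l, u):
    # Fraction of rows with l <= x <= u (values are 1-indexed, inclusive).
    return (prefix[u] - prefix[l - 1]) / prefix[NUM_X]

def scale(v):
    # The tests' preprocessing: map a bound in [1, NUM_X] to ~[-1, 1].
    return (2.0 / NUM_X) * v - 1.0

print(selectivity(250, 750), scale(250), scale(750))   # 0.501 -0.5 0.5
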
+    case DistributionType::SkewedDistribution: {
+      // generate a skewed dataset
+      int NUM_X = 1000;
+      matrix_eig hist = matrix_eig::Zero(NUM_X + 1, 1);
+      matrix_eig sum = matrix_eig::Zero(NUM_X + 1, 1);
+      float sum_hist = 0;
+
+      // skewed
+      for (int i = 1; i < 100; i++) {
+        hist(i, 0) = 2 + std::round(100 *
+                         std::exp(-0.001 * std::pow(i - 100.0, 2)));
+      }
+      for (int i = 100; i <= NUM_X; i++) {
+        hist(i, 0) = 2 + std::round(100 *
+                         std::exp(-0.00008 * std::pow(i - 100.0, 2)));
+      }
+
+      for (int i = 1; i <= NUM_X; i++) {
+        sum(i, 0) = sum(i - 1, 0) + hist(i, 0);
+      }
+      sum_hist = sum(NUM_X, 0);
+
+      // generate training and testing data randomly
+      data = matrix_eig::Zero(num_samples, 3);
+      std::mt19937 rng;
+      rng.seed(std::random_device()());
+      std::uniform_int_distribution<int> dist(1, NUM_X);
+
+      // data: [lowerbound, upperbound, truth selectivity]
+      for (size_t i = 0; i < num_samples; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        // assume the max & min values of the column are known,
+        // so the [min, max] -> [-1, 1] preprocessing can be done here
+        data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        data(i, 2) = sel;
+      }
+      matrix_eig test_lowsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_highsel_data = matrix_eig::Zero(num_tests, 3);
+      matrix_eig test_random_data = matrix_eig::Zero(num_tests, 3);
+      std::uniform_int_distribution<int> dist_low(300, 999);
+      std::uniform_int_distribution<int> dist_high(50, 150);
+
+      // generate test data on the low end
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist_low(rng);
+        int u = dist_low(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_lowsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_lowsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_lowsel_data(i, 2) = sel;
+      }
+
+      // generate test data on the high end
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist_high(rng);
+        int u = dist_high(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_highsel_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_highsel_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_highsel_data(i, 2) = sel;
+      }
+
+      // generate test data randomly
+      for (size_t i = 0; i < num_tests; i++) {
+        int l = dist(rng);
+        int u = dist(rng);
+        if (l > u) {
+          std::swap(l, u);
+        }
+        float sel = (sum(u, 0) - sum(l - 1, 0)) / sum_hist;
+        test_random_data(i, 0) = (2 / (float)NUM_X) * l - 1;
+        test_random_data(i, 1) = (2 / (float)NUM_X) * u - 1;
+        test_random_data(i, 2) = sel;
+      }
+
+      std::vector<matrix_eig> data_vec = {data, test_highsel_data,
+                                          test_lowsel_data, test_random_data};
+      data = peloton::brain::EigenUtil::VStack(data_vec);
+      break;
+    }
+  }
+  return data;
+}
+
+} // namespace test
+} // namespace peloton
diff --git a/test/brain/testing_forecast_util.cpp b/test/brain/testing_forecast_util.cpp
index 07f9383bd93..d5a37b60f74 100644
--- a/test/brain/testing_forecast_util.cpp
+++ b/test/brain/testing_forecast_util.cpp
@@ -1,7 +1,9 @@
 #include "brain/testing_forecast_util.h"
 #include
 #include "brain/util/model_util.h"
+#include "brain/util/eigen_util.h"
 #include "common/harness.h"
+#include
 
 namespace peloton {
 namespace test {
@@ -24,7 +26,9 @@ void TestingForecastUtil::WorkloadTest(
   matrix_eig train_data = data.topRows(split_point);
   n.Fit(train_data);
   train_data = n.Transform(train_data);
-  matrix_eig test_data = n.Transform(data.bottomRows(split_point));
+  matrix_eig test_data =
+      n.Transform(data.bottomRows(
+          static_cast<size_t>(data.rows() - split_point)));
 
   vector_eig train_loss_avg = vector_eig::Zero(val_interval);
   float prev_train_loss = std::numeric_limits<float>::max();
@@ -110,5 +114,6 @@ matrix_eig TestingForecastUtil::GetWorkload(WorkloadType w, size_t num_samples,
   return data;
 }
+
 } // namespace test
-} // namespace peloton
\ No newline at end of file
+} // namespace peloton
diff --git a/test/include/brain/testing_augmented_nn_util.h b/test/include/brain/testing_augmented_nn_util.h
new file mode 100644
index 00000000000..0c45d67fc7a
--- /dev/null
+++ b/test/include/brain/testing_augmented_nn_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "common/internal_types.h"
+#include "brain/util/eigen_util.h"
+#include "brain/workload/base_tf.h"
+#include "brain/selectivity/augmented_nn.h"
+
+namespace peloton {
+namespace test {
+
+enum class DistributionType { UniformDistribution, SkewedDistribution };
+
+class TestingAugmentedNNUtil {
+ public:
+  static void Test(brain::AugmentedNN& model,
+                   DistributionType d, size_t val_interval,
+                   size_t num_samples = 1000,
+                   float val_split = 0.5,
+                   bool normalize = false,
+                   float val_loss_thresh = 0.06,
+                   size_t early_stop_patience = 10,
+                   float early_stop_delta = 0.01);
+  // private:
+  static matrix_eig GetData(DistributionType d,
+                            size_t num_samples, size_t num_tests);
+};
+
+} // namespace test
+} // namespace peloton
diff --git a/test/include/brain/testing_forecast_util.h b/test/include/brain/testing_forecast_util.h
index 1c3d6bee168..01750612a9c 100644
--- a/test/include/brain/testing_forecast_util.h
+++ b/test/include/brain/testing_forecast_util.h
@@ -34,5 +34,6 @@ class TestingForecastUtil{
                   size_t num_feats);
 };
+
+}
 }
-} \ No newline at end of file
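
Putting the pieces together, the test harness trains until either the epoch budget or early stopping kicks in. A Python skeleton of that control flow: `early_stop` below is one plausible reading of ModelUtil::EarlyStop (whose actual semantics live in the C++ util), and the `model` methods are hypothetical stand-ins for the AugmentedNN API.

def early_stop(val_losses, patience, delta):
    # Assumed semantics: stop once the best loss in the last `patience`
    # validations is no better (by more than `delta`) than the best before.
    if len(val_losses) <= patience:
        return False
    best_before = min(val_losses[:-patience])
    return min(val_losses[-patience:]) > best_before - delta

def train(model, train_data, val_data, val_interval, patience, delta):
    val_losses = []
    for epoch in range(1, model.epochs + 1):
        if early_stop(val_losses, patience, delta):
            break
        model.train_epoch(train_data)
        if epoch % val_interval == 0:
            val_losses.append(model.validate_epoch(val_data))
    return val_losses
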