Merge branch 'staging' into andreas/python38
anargyri authored Jan 21, 2022
2 parents 8cfbc4a + 02451f1 commit cf7bfd8
Showing 8 changed files with 307 additions and 109 deletions.
142 changes: 99 additions & 43 deletions examples/00_quick_start/rbm_movielens.ipynb

Large diffs are not rendered by default.

72 changes: 45 additions & 27 deletions examples/02_model_collaborative_filtering/rbm_deep_dive.ipynb

Large diffs are not rendered by default.

@@ -115,7 +115,6 @@
 "NUM_ITERATIONS = 50\n",
 "LEARNING_RATE = 0.1\n",
 "FEATURE_FRACTION = 0.8\n",
-"EARLY_STOPPING_ROUND = 10\n",
 "\n",
 "# Model name\n",
 "MODEL_NAME = 'lightgbm_criteo.mml'"
@@ -321,8 +320,7 @@
 "- `numLeaves`: the number of leaves in each tree\n",
 "- `numIterations`: the number of iterations to apply boosting\n",
 "- `learningRate`: the learning rate for training across trees\n",
-"- `featureFraction`: the fraction of features used for training a tree\n",
-"- `earlyStoppingRound`: round at which early stopping can be applied to avoid overfitting"
+"- `featureFraction`: the fraction of features used for training a tree"
 ]
 },
 {
@@ -342,8 +340,7 @@
 "    numLeaves=NUM_LEAVES,\n",
 "    numIterations=NUM_ITERATIONS,\n",
 "    learningRate=LEARNING_RATE,\n",
-"    featureFraction=FEATURE_FRACTION,\n",
-"    earlyStoppingRound=EARLY_STOPPING_ROUND\n",
+"    featureFraction=FEATURE_FRACTION\n",
 ")"
 ]
 },
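For reference, a sketch of the resulting estimator construction once early stopping is dropped; the import path and column names below are assumptions for illustration, not part of this diff:

```python
from mmlspark.lightgbm import LightGBMClassifier  # assumed MMLSpark import path

lgbm = LightGBMClassifier(
    labelCol="label",        # assumed label column
    featuresCol="features",  # assumed features column
    objective="binary",
    numLeaves=NUM_LEAVES,
    numIterations=NUM_ITERATIONS,
    learningRate=LEARNING_RATE,
    featureFraction=FEATURE_FRACTION,
)
model = lgbm.fit(train)  # `train` is an assumed Spark DataFrame
```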
97 changes: 70 additions & 27 deletions recommenders/models/rbm/rbm.py
@@ -4,6 +4,8 @@
 import numpy as np
 import tensorflow as tf
 import logging
+import os
+from pathlib import Path
 
 tf.compat.v1.disable_eager_execution()
 log = logging.getLogger(__name__)
@@ -14,6 +16,9 @@ class RBM:
 
     def __init__(
         self,
+        n_users,
+        possible_ratings,
+        visible_units,
         hidden_units=500,
         keep_prob=0.7,
         init_stdv=0.1,
@@ -101,6 +106,33 @@ def __init__(
         np.random.seed(self.seed)
         tf.compat.v1.set_random_seed(self.seed)
 
+        self.n_visible = visible_units  # number of items
+        self.n_users = n_users  # number of users
+
+        tf.compat.v1.reset_default_graph()
+
+        # ----------------------Initializers-------------------------------------
+
+        # sorted list of all the unique ratings (float type), supplied by the caller
+        self.possible_ratings = possible_ratings
+
+        # create a lookup table to map integer indices to float ratings
+        self.ratings_lookup_table = tf.lookup.StaticHashTable(
+            tf.lookup.KeyValueTensorInitializer(
+                tf.constant(list(range(len(self.possible_ratings))), dtype=tf.int32),
+                tf.constant(list(self.possible_ratings), dtype=tf.float32),
+            ), default_value=0
+        )
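For context, a minimal self-contained sketch of what this lookup table does (mapping sampled integer indices back to float rating values); the five-star scale below is an assumption for illustration:

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

possible_ratings = [1.0, 2.0, 3.0, 4.0, 5.0]  # assumed rating scale
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.constant(list(range(len(possible_ratings))), dtype=tf.int32),
        tf.constant(possible_ratings, dtype=tf.float32),
    ),
    default_value=0,  # unknown indices fall back to 0, i.e. "unrated"
)
ratings = table.lookup(tf.constant([0, 2, 4], dtype=tf.int32))

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.tables_initializer())
    print(sess.run(ratings))  # [1. 3. 5.]
```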

+        self.generate_graph()
+        self.init_metrics()
+        self.init_gpu()
+        init_graph = tf.compat.v1.global_variables_initializer()
+
+        # Start TF training session on default graph
+        self.sess = tf.compat.v1.Session(config=self.config_gpu)
+        self.sess.run(init_graph)
 
     def binomial_sampling(self, pr):
         """Binomial sampling of hidden units activations using a rejection method.
@@ -482,12 +514,6 @@ def init_training_session(self, xtr):
             xtr (numpy.ndarray, int32): The user/affinity matrix for the train set.
         """
 
-        init_graph = tf.compat.v1.global_variables_initializer()
-
-        # Start TF training session on default graph
-        self.sess = tf.compat.v1.Session(config=self.config_gpu)
-        self.sess.run(init_graph)
-
         self.sess.run(
             self.iter.initializer,
             feed_dict={self.vu: xtr, self.batch_size: self.minibatch},
@@ -522,7 +548,7 @@ def batch_training(self, num_minibatches):
 
         return epoch_tr_err
 
-    def fit(self, xtr, xtst):
+    def fit(self, xtr):
         """Fit method
 
         Training in generative models takes place in two steps:
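For orientation, the two steps are (1) Gibbs sampling to obtain samples from the model and (2) a gradient update built from the positive and negative phase statistics. A schematic CD-1 step for a binary RBM in NumPy, offered as an illustration rather than as this class's TensorFlow implementation:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cd1_step(v0, W, bh, bv, lr=0.1, rng=np.random.default_rng(0)):
    # step 1: Gibbs sampling, data -> hidden -> visible -> hidden
    ph0 = sigmoid(v0 @ W + bh)                # P(h = 1 | v0), positive phase
    h0 = (rng.random(ph0.shape) < ph0) * 1.0  # Bernoulli sample of the hidden units
    pv1 = sigmoid(h0 @ W.T + bv)              # P(v = 1 | h0)
    v1 = (rng.random(pv1.shape) < pv1) * 1.0  # model sample of the visible units
    ph1 = sigmoid(v1 @ W + bh)                # negative phase
    # step 2: gradient evaluation, positive minus negative statistics
    m = v0.shape[0]
    W += lr * (v0.T @ ph0 - v1.T @ ph1) / m
    bh += lr * (ph0 - ph1).mean(axis=0)
    bv += lr * (v0 - v1).mean(axis=0)
    return W, bh, bv
```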
@@ -545,27 +571,8 @@ def fit(self, xtr, xtst):
         # keep the position of the items in the train set so that they can be optionally excluded from recommendation
         self.seen_mask = np.not_equal(xtr, 0)
 
-        m, self.n_visible = xtr.shape  # m = number of users, n_visible = number of items
-        num_minibatches = int(m / self.minibatch)  # number of minibatches
-
-        tf.compat.v1.reset_default_graph()
-
-        # ----------------------Initializers-------------------------------------
-
-        # create a sorted list of all the unique ratings (of float type)
-        self.possible_ratings = np.sort(np.setdiff1d(np.unique(xtr), np.array([0])))
-
-        # create a lookup table to map integer indices to float ratings
-        self.ratings_lookup_table = tf.lookup.StaticHashTable(
-            tf.lookup.KeyValueTensorInitializer(
-                tf.constant(list(range(len(self.possible_ratings))), dtype=tf.int32),
-                tf.constant(list(self.possible_ratings), dtype=tf.float32),
-            ), default_value=0
-        )
+        num_minibatches = int(self.n_users / self.minibatch)  # number of minibatches
 
-        self.generate_graph()
-        self.init_metrics()
-        self.init_gpu()
         self.init_training_session(xtr)
 
         rmse_train = []  # List to collect the metrics across epochs
@@ -691,3 +698,39 @@ def predict(self, x):
         vp = self.sess.run(v_, feed_dict={self.vu: x})
 
         return vp
+
+    def save(self, file_path='./rbm_model.ckpt'):
+        """Save model parameters to `file_path`.
+
+        This function saves the current TensorFlow session to the specified path.
+
+        Args:
+            file_path (str): Output file path for the RBM model checkpoint;
+                the directory is created if it does not already exist.
+        """
+
+        f_path = Path(file_path)
+        dir_name, file_name = f_path.parent, f_path.name
+
+        # create the directory if it does not exist
+        os.makedirs(dir_name, exist_ok=True)
+
+        # save trained model
+        saver = tf.compat.v1.train.Saver()
+        saver.save(self.sess, os.path.join(dir_name, file_name))
+
+    def load(self, file_path='./rbm_model.ckpt'):
+        """Load model parameters for further use.
+
+        This function restores a saved TensorFlow session.
+
+        Args:
+            file_path (str): File path for the RBM model checkpoint.
+        """
+
+        f_path = Path(file_path)
+        dir_name, file_name = f_path.parent, f_path.name
+
+        # load pre-trained model
+        saver = tf.compat.v1.train.Saver()
+        saver.restore(self.sess, os.path.join(dir_name, file_name))
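Taken together, the calling convention this diff introduces looks roughly as follows; the toy matrix and hyperparameter values are assumptions for illustration:

```python
import numpy as np
from recommenders.models.rbm.rbm import RBM

# toy user/item affinity matrix: 100 users x 50 items, ratings 1-5, 0 = unrated
Xtr = np.random.randint(0, 6, size=(100, 50))

model = RBM(
    n_users=Xtr.shape[0],
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=100,
    training_epoch=2,
    minibatch_size=20,
)
model.fit(Xtr)                 # the held-out test matrix is no longer an argument
model.save("rbm_model.ckpt")   # checkpoints via tf.compat.v1.train.Saver
model.load("rbm_model.ckpt")
vp = model.predict(Xtr)        # reconstructed ratings
```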
2 changes: 1 addition & 1 deletion tests/integration/examples/test_notebooks_pyspark.py
@@ -53,7 +53,7 @@ def test_mmlspark_lightgbm_criteo_integration(notebooks, output_notebook, kernel
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
+        parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50),
     )
     results = sb.read_notebook(output_notebook).scraps.dataframe.set_index("name")[
         "data"
2 changes: 1 addition & 1 deletion tests/smoke/examples/test_notebooks_pyspark.py
@@ -53,7 +53,7 @@ def test_mmlspark_lightgbm_criteo_smoke(notebooks, output_notebook, kernel_name)
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
+        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50),
     )
 
     results = sb.read_notebook(output_notebook).scraps.dataframe.set_index("name")[
2 changes: 1 addition & 1 deletion tests/unit/examples/test_notebooks_pyspark.py
@@ -132,5 +132,5 @@ def test_mmlspark_lightgbm_criteo_runs(notebooks, output_notebook, kernel_name):
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=10, EARLY_STOPPING_ROUND=2),
+        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=10),
     )
92 changes: 88 additions & 4 deletions tests/unit/recommenders/models/test_rbm.py
@@ -13,6 +13,9 @@
 @pytest.fixture(scope="module")
 def init_rbm():
     return {
+        "n_users": 5000,
+        "possible_ratings": [1, 2, 3, 4, 5],
+        "n_visible": 500,
         "n_hidden": 100,
         "epochs": 10,
         "minibatch": 50,
@@ -27,6 +30,9 @@ def init_rbm():
 @pytest.mark.gpu
 def test_class_init(init_rbm):
     model = RBM(
+        n_users=init_rbm["n_users"],
+        possible_ratings=init_rbm["possible_ratings"],
+        visible_units=init_rbm["n_visible"],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
@@ -37,6 +43,12 @@ def test_class_init(init_rbm):
         display_epoch=init_rbm["display_epoch"],
     )
 
+    # number of users
+    assert model.n_users == init_rbm["n_users"]
+    # list of unique rating values
+    assert np.array_equal(model.possible_ratings, init_rbm["possible_ratings"])
+    # number of visible units
+    assert model.n_visible == init_rbm["n_visible"]
     # number of hidden units
     assert model.n_hidden == init_rbm["n_hidden"]
     # number of training epochs
@@ -58,16 +70,19 @@
 @pytest.mark.gpu
 def test_train_param_init(init_rbm, affinity_matrix):
     # obtain the train/test set matrices
-    Xtr, Xtst = affinity_matrix
+    Xtr, _ = affinity_matrix
 
     # initialize the model
     model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
     )
     # fit the model to the data
-    model.fit(Xtr, Xtst)
+    model.fit(Xtr)
 
     # visible units placeholder (tensor)
     assert model.vu.shape[1] == Xtr.shape[1]
@@ -82,10 +97,13 @@
 @pytest.mark.gpu
 def test_sampling_funct(init_rbm, affinity_matrix):
     # obtain the train/test set matrices
-    Xtr, Xtst = affinity_matrix
+    Xtr, _ = affinity_matrix
 
     # initialize the model
     model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
@@ -106,7 +124,7 @@ def check_sampled_values(sampled, s):
     r = Xtr.max()  # obtain the rating scale
 
     # fit the model to the data
-    model.fit(Xtr, Xtst)
+    model.fit(Xtr)
 
     # evaluate the activation probabilities of the hidden units and their sampled values
     phv, h = model.sess.run(model.sample_hidden_units(model.v))
@@ -132,3 +150,69 @@ def check_sampled_values(sampled, s):
 
     # check that the sampled values of the visible units are in [0, r]
     assert check_sampled_values(v_sampled, r).all()
+
+
+@pytest.mark.gpu
+def test_save_load(init_rbm, affinity_matrix):
+
+    # obtain the train/test set matrices
+    Xtr, _ = affinity_matrix
+
+    # initialize the model
+    original_model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
+        hidden_units=init_rbm["n_hidden"],
+        training_epoch=init_rbm["epochs"],
+        minibatch_size=init_rbm["minibatch"],
+        keep_prob=init_rbm["keep_prob"],
+        learning_rate=init_rbm["learning_rate"],
+        init_stdv=init_rbm["init_stdv"],
+        sampling_protocol=init_rbm["sampling_protocol"],
+        display_epoch=init_rbm["display_epoch"],
+    )
+
+    # save the model
+    original_model.save()
+
+    # initialize another model
+    saved_model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
+        hidden_units=init_rbm["n_hidden"],
+        training_epoch=init_rbm["epochs"],
+        minibatch_size=init_rbm["minibatch"],
+        keep_prob=init_rbm["keep_prob"],
+        learning_rate=init_rbm["learning_rate"],
+        init_stdv=init_rbm["init_stdv"],
+        sampling_protocol=init_rbm["sampling_protocol"],
+        display_epoch=init_rbm["display_epoch"],
+    )
+
+    # load the pretrained model
+    saved_model.load()
+
+    # number of users
+    assert saved_model.n_users == original_model.n_users
+    # list of unique rating values
+    assert np.array_equal(saved_model.possible_ratings, original_model.possible_ratings)
+    # number of visible units
+    assert saved_model.n_visible == original_model.n_visible
+    # number of hidden units
+    assert saved_model.n_hidden == original_model.n_hidden
+    # number of training epochs
+    assert saved_model.epochs == original_model.epochs
+    # minibatch size
+    assert saved_model.minibatch == original_model.minibatch
+    # keep probability for dropout regularization
+    assert saved_model.keep == original_model.keep
+    # learning rate
+    assert saved_model.learning_rate == original_model.learning_rate
+    # standard deviation used to initialize the weight matrix from a normal distribution
+    assert saved_model.stdv == original_model.stdv
+    # sampling protocol used to increase the number of steps in Gibbs sampling
+    assert saved_model.sampling_protocol == original_model.sampling_protocol
+    # number of epochs after which the RMSE is displayed
+    assert saved_model.display_epoch == original_model.display_epoch
