Merge branch 'staging' into andreas/python38
anargyri authored Jan 21, 2022
2 parents 8cfbc4a + 02451f1 commit cf7bfd8
Showing 8 changed files with 307 additions and 109 deletions.
142 changes: 99 additions & 43 deletions examples/00_quick_start/rbm_movielens.ipynb

Large diffs are not rendered by default.

72 changes: 45 additions & 27 deletions examples/02_model_collaborative_filtering/rbm_deep_dive.ipynb

Large diffs are not rendered by default.

@@ -115,7 +115,6 @@
 "NUM_ITERATIONS = 50\n",
 "LEARNING_RATE = 0.1\n",
 "FEATURE_FRACTION = 0.8\n",
-"EARLY_STOPPING_ROUND = 10\n",
 "\n",
 "# Model name\n",
 "MODEL_NAME = 'lightgbm_criteo.mml'"
@@ -321,8 +320,7 @@
 "- `numLeaves`: the number of leaves in each tree\n",
 "- `numIterations`: the number of iterations to apply boosting\n",
 "- `learningRate`: the learning rate for training across trees\n",
-"- `featureFraction`: the fraction of features used for training a tree\n",
-"- `earlyStoppingRound`: round at which early stopping can be applied to avoid overfitting"
+"- `featureFraction`: the fraction of features used for training a tree"
 ]
 },
 {
@@ -342,8 +340,7 @@
 "    numLeaves=NUM_LEAVES,\n",
 "    numIterations=NUM_ITERATIONS,\n",
 "    learningRate=LEARNING_RATE,\n",
-"    featureFraction=FEATURE_FRACTION,\n",
-"    earlyStoppingRound=EARLY_STOPPING_ROUND\n",
+"    featureFraction=FEATURE_FRACTION\n",
 ")"
 ]
 },
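For reference, a sketch of the resulting estimator construction once early stopping is dropped; the import path and column names below are assumptions for illustration, not part of this diff:

```python
from mmlspark.lightgbm import LightGBMClassifier  # assumed MMLSpark import path

lgbm = LightGBMClassifier(
    labelCol="label",        # assumed label column
    featuresCol="features",  # assumed features column
    objective="binary",
    numLeaves=NUM_LEAVES,
    numIterations=NUM_ITERATIONS,
    learningRate=LEARNING_RATE,
    featureFraction=FEATURE_FRACTION,
)
model = lgbm.fit(train)  # `train` is an assumed Spark DataFrame
```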
97 changes: 70 additions & 27 deletions recommenders/models/rbm/rbm.py
@@ -4,6 +4,8 @@
 import numpy as np
 import tensorflow as tf
 import logging
+import os
+from pathlib import Path
 
 tf.compat.v1.disable_eager_execution()
 log = logging.getLogger(__name__)
@@ -14,6 +16,9 @@ class RBM:
 
     def __init__(
         self,
+        n_users,
+        possible_ratings,
+        visible_units,
         hidden_units=500,
         keep_prob=0.7,
         init_stdv=0.1,
@@ -101,6 +106,33 @@ def __init__(
         np.random.seed(self.seed)
         tf.compat.v1.set_random_seed(self.seed)
 
+        self.n_visible = visible_units  # number of items
+        self.n_users = n_users  # number of users
+
+        tf.compat.v1.reset_default_graph()
+
+        # ----------------------Initializers-------------------------------------
+
+        # sorted list of all the unique ratings (float type), supplied by the caller
+        self.possible_ratings = possible_ratings
+
+        # create a lookup table to map integer indices to float ratings
+        self.ratings_lookup_table = tf.lookup.StaticHashTable(
+            tf.lookup.KeyValueTensorInitializer(
+                tf.constant(list(range(len(self.possible_ratings))), dtype=tf.int32),
+                tf.constant(list(self.possible_ratings), dtype=tf.float32),
+            ), default_value=0
+        )
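For context, a minimal self-contained sketch of what this lookup table does (mapping sampled integer indices back to float rating values); the five-star scale below is an assumption for illustration:

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

possible_ratings = [1.0, 2.0, 3.0, 4.0, 5.0]  # assumed rating scale
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        tf.constant(list(range(len(possible_ratings))), dtype=tf.int32),
        tf.constant(possible_ratings, dtype=tf.float32),
    ),
    default_value=0,  # unknown indices fall back to 0, i.e. "unrated"
)
ratings = table.lookup(tf.constant([0, 2, 4], dtype=tf.int32))

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.tables_initializer())
    print(sess.run(ratings))  # [1. 3. 5.]
```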

+        self.generate_graph()
+        self.init_metrics()
+        self.init_gpu()
+        init_graph = tf.compat.v1.global_variables_initializer()
+
+        # Start TF training session on default graph
+        self.sess = tf.compat.v1.Session(config=self.config_gpu)
+        self.sess.run(init_graph)
 
     def binomial_sampling(self, pr):
         """Binomial sampling of hidden units activations using a rejection method.
@@ -482,12 +514,6 @@ def init_training_session(self, xtr):
             xtr (numpy.ndarray, int32): The user/affinity matrix for the train set.
         """
 
-        init_graph = tf.compat.v1.global_variables_initializer()
-
-        # Start TF training session on default graph
-        self.sess = tf.compat.v1.Session(config=self.config_gpu)
-        self.sess.run(init_graph)
-
         self.sess.run(
             self.iter.initializer,
             feed_dict={self.vu: xtr, self.batch_size: self.minibatch},
@@ -522,7 +548,7 @@ def batch_training(self, num_minibatches):
 
         return epoch_tr_err
 
-    def fit(self, xtr, xtst):
+    def fit(self, xtr):
         """Fit method
 
         Training in generative models takes place in two steps:
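For orientation, the two steps are (1) Gibbs sampling to obtain samples from the model and (2) a gradient update built from the positive and negative phase statistics. A schematic CD-1 step for a binary RBM in NumPy, offered as an illustration rather than as this class's TensorFlow implementation:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cd1_step(v0, W, bh, bv, lr=0.1, rng=np.random.default_rng(0)):
    # step 1: Gibbs sampling, data -> hidden -> visible -> hidden
    ph0 = sigmoid(v0 @ W + bh)                # P(h = 1 | v0), positive phase
    h0 = (rng.random(ph0.shape) < ph0) * 1.0  # Bernoulli sample of the hidden units
    pv1 = sigmoid(h0 @ W.T + bv)              # P(v = 1 | h0)
    v1 = (rng.random(pv1.shape) < pv1) * 1.0  # model sample of the visible units
    ph1 = sigmoid(v1 @ W + bh)                # negative phase
    # step 2: gradient evaluation, positive minus negative statistics
    m = v0.shape[0]
    W += lr * (v0.T @ ph0 - v1.T @ ph1) / m
    bh += lr * (ph0 - ph1).mean(axis=0)
    bv += lr * (v0 - v1).mean(axis=0)
    return W, bh, bv
```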
@@ -545,27 +571,8 @@ def fit(self, xtr, xtst):
         # keep the position of the items in the train set so that they can be optionally excluded from recommendation
         self.seen_mask = np.not_equal(xtr, 0)
 
-        m, self.n_visible = xtr.shape  # m = number of users, n_visible = number of items
-        num_minibatches = int(m / self.minibatch)  # number of minibatches
-
-        tf.compat.v1.reset_default_graph()
-
-        # ----------------------Initializers-------------------------------------
-
-        # create a sorted list of all the unique ratings (of float type)
-        self.possible_ratings = np.sort(np.setdiff1d(np.unique(xtr), np.array([0])))
-
-        # create a lookup table to map integer indices to float ratings
-        self.ratings_lookup_table = tf.lookup.StaticHashTable(
-            tf.lookup.KeyValueTensorInitializer(
-                tf.constant(list(range(len(self.possible_ratings))), dtype=tf.int32),
-                tf.constant(list(self.possible_ratings), dtype=tf.float32),
-            ), default_value=0
-        )
+        num_minibatches = int(self.n_users / self.minibatch)  # number of minibatches
 
-        self.generate_graph()
-        self.init_metrics()
-        self.init_gpu()
         self.init_training_session(xtr)
 
         rmse_train = []  # List to collect the metrics across epochs
@@ -691,3 +698,39 @@ def predict(self, x):
         vp = self.sess.run(v_, feed_dict={self.vu: x})
 
         return vp
+
+    def save(self, file_path='./rbm_model.ckpt'):
+        """Save model parameters to `file_path`.
+
+        This function saves the current TensorFlow session to the specified path.
+
+        Args:
+            file_path (str): Output file path for the RBM model checkpoint;
+                the directory is created if it does not already exist.
+        """
+
+        f_path = Path(file_path)
+        dir_name, file_name = f_path.parent, f_path.name
+
+        # create the directory if it does not exist
+        os.makedirs(dir_name, exist_ok=True)
+
+        # save trained model
+        saver = tf.compat.v1.train.Saver()
+        saver.save(self.sess, os.path.join(dir_name, file_name))
+
+    def load(self, file_path='./rbm_model.ckpt'):
+        """Load model parameters for further use.
+
+        This function restores a saved TensorFlow session.
+
+        Args:
+            file_path (str): File path for the RBM model checkpoint.
+        """
+
+        f_path = Path(file_path)
+        dir_name, file_name = f_path.parent, f_path.name
+
+        # load pre-trained model
+        saver = tf.compat.v1.train.Saver()
+        saver.restore(self.sess, os.path.join(dir_name, file_name))
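Taken together, the calling convention this diff introduces looks roughly as follows; the toy matrix and hyperparameter values are assumptions for illustration:

```python
import numpy as np
from recommenders.models.rbm.rbm import RBM

# toy user/item affinity matrix: 100 users x 50 items, ratings 1-5, 0 = unrated
Xtr = np.random.randint(0, 6, size=(100, 50))

model = RBM(
    n_users=Xtr.shape[0],
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=100,
    training_epoch=2,
    minibatch_size=20,
)
model.fit(Xtr)                 # the held-out test matrix is no longer an argument
model.save("rbm_model.ckpt")   # checkpoints via tf.compat.v1.train.Saver
model.load("rbm_model.ckpt")
vp = model.predict(Xtr)        # reconstructed ratings
```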
2 changes: 1 addition & 1 deletion tests/integration/examples/test_notebooks_pyspark.py
@@ -53,7 +53,7 @@ def test_mmlspark_lightgbm_criteo_integration(notebooks, output_notebook, kernel
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
+        parameters=dict(DATA_SIZE="full", NUM_ITERATIONS=50),
     )
     results = sb.read_notebook(output_notebook).scraps.dataframe.set_index("name")[
         "data"
2 changes: 1 addition & 1 deletion tests/smoke/examples/test_notebooks_pyspark.py
@@ -53,7 +53,7 @@ def test_mmlspark_lightgbm_criteo_smoke(notebooks, output_notebook, kernel_name)
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
+        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50),
     )
 
     results = sb.read_notebook(output_notebook).scraps.dataframe.set_index("name")[
2 changes: 1 addition & 1 deletion tests/unit/examples/test_notebooks_pyspark.py
@@ -132,5 +132,5 @@ def test_mmlspark_lightgbm_criteo_runs(notebooks, output_notebook, kernel_name):
         notebook_path,
         output_notebook,
         kernel_name=kernel_name,
-        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=10, EARLY_STOPPING_ROUND=2),
+        parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=10),
     )
92 changes: 88 additions & 4 deletions tests/unit/recommenders/models/test_rbm.py
@@ -13,6 +13,9 @@
 @pytest.fixture(scope="module")
 def init_rbm():
     return {
+        "n_users": 5000,
+        "possible_ratings": [1, 2, 3, 4, 5],
+        "n_visible": 500,
         "n_hidden": 100,
         "epochs": 10,
         "minibatch": 50,
@@ -27,6 +30,9 @@ def init_rbm():
 @pytest.mark.gpu
 def test_class_init(init_rbm):
     model = RBM(
+        n_users=init_rbm["n_users"],
+        possible_ratings=init_rbm["possible_ratings"],
+        visible_units=init_rbm["n_visible"],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
@@ -37,6 +43,12 @@ def test_class_init(init_rbm):
         display_epoch=init_rbm["display_epoch"],
     )
 
+    # number of users
+    assert model.n_users == init_rbm["n_users"]
+    # list of unique rating values
+    assert np.array_equal(model.possible_ratings, init_rbm["possible_ratings"])
+    # number of visible units
+    assert model.n_visible == init_rbm["n_visible"]
     # number of hidden units
     assert model.n_hidden == init_rbm["n_hidden"]
     # number of training epochs
@@ -58,16 +70,19 @@
 @pytest.mark.gpu
 def test_train_param_init(init_rbm, affinity_matrix):
     # obtain the train/test set matrices
-    Xtr, Xtst = affinity_matrix
+    Xtr, _ = affinity_matrix
 
     # initialize the model
     model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
     )
     # fit the model to the data
-    model.fit(Xtr, Xtst)
+    model.fit(Xtr)
 
     # visible units placeholder (tensor)
     assert model.vu.shape[1] == Xtr.shape[1]
@@ -82,10 +97,13 @@
 @pytest.mark.gpu
 def test_sampling_funct(init_rbm, affinity_matrix):
     # obtain the train/test set matrices
-    Xtr, Xtst = affinity_matrix
+    Xtr, _ = affinity_matrix
 
     # initialize the model
     model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
         hidden_units=init_rbm["n_hidden"],
         training_epoch=init_rbm["epochs"],
         minibatch_size=init_rbm["minibatch"],
@@ -106,7 +124,7 @@ def check_sampled_values(sampled, s):
     r = Xtr.max()  # obtain the rating scale
 
     # fit the model to the data
-    model.fit(Xtr, Xtst)
+    model.fit(Xtr)
 
     # evaluate the activation probabilities of the hidden units and their sampled values
     phv, h = model.sess.run(model.sample_hidden_units(model.v))
@@ -132,3 +150,69 @@ def check_sampled_values(sampled, s):
 
     # check that the sampled values of the visible units are in [0, r]
     assert check_sampled_values(v_sampled, r).all()
+
+
+@pytest.mark.gpu
+def test_save_load(init_rbm, affinity_matrix):
+
+    # obtain the train/test set matrices
+    Xtr, _ = affinity_matrix
+
+    # initialize the model
+    original_model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
+        hidden_units=init_rbm["n_hidden"],
+        training_epoch=init_rbm["epochs"],
+        minibatch_size=init_rbm["minibatch"],
+        keep_prob=init_rbm["keep_prob"],
+        learning_rate=init_rbm["learning_rate"],
+        init_stdv=init_rbm["init_stdv"],
+        sampling_protocol=init_rbm["sampling_protocol"],
+        display_epoch=init_rbm["display_epoch"],
+    )
+
+    # save the model
+    original_model.save()
+
+    # initialize another model
+    saved_model = RBM(
+        n_users=Xtr.shape[0],
+        possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
+        visible_units=Xtr.shape[1],
+        hidden_units=init_rbm["n_hidden"],
+        training_epoch=init_rbm["epochs"],
+        minibatch_size=init_rbm["minibatch"],
+        keep_prob=init_rbm["keep_prob"],
+        learning_rate=init_rbm["learning_rate"],
+        init_stdv=init_rbm["init_stdv"],
+        sampling_protocol=init_rbm["sampling_protocol"],
+        display_epoch=init_rbm["display_epoch"],
+    )
+
+    # load the pretrained model
+    saved_model.load()
+
+    # number of users
+    assert saved_model.n_users == original_model.n_users
+    # list of unique rating values
+    assert np.array_equal(saved_model.possible_ratings, original_model.possible_ratings)
+    # number of visible units
+    assert saved_model.n_visible == original_model.n_visible
+    # number of hidden units
+    assert saved_model.n_hidden == original_model.n_hidden
+    # number of training epochs
+    assert saved_model.epochs == original_model.epochs
+    # minibatch size
+    assert saved_model.minibatch == original_model.minibatch
+    # keep probability for dropout regularization
+    assert saved_model.keep == original_model.keep
+    # learning rate
+    assert saved_model.learning_rate == original_model.learning_rate
+    # standard deviation used to initialize the weight matrix from a normal distribution
+    assert saved_model.stdv == original_model.stdv
+    # sampling protocol used to increase the number of steps in Gibbs sampling
+    assert saved_model.sampling_protocol == original_model.sampling_protocol
+    # number of epochs after which the RMSE is displayed
+    assert saved_model.display_epoch == original_model.display_epoch
