Merge pull request #28 from deepray-AI/recommendation

init_checkpoint
deepray-AI · Oct 18, 2023 · 9e892f8 · 9e892f8
2 parents c7024ed + b32d0cf
commit 9e892f8
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 35 deletions.
diff --git a/deepray/core/base_trainer.py b/deepray/core/base_trainer.py
@@ -611,8 +611,9 @@ def fit(
       if FLAGS.init_checkpoint:
         for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), FLAGS.init_checkpoint):
           if init_ckpt:
-            logging.info(f'Checkpoint file {init_ckpt} found and restoring from initial checkpoint for {name} model.')
-            ckpt.restore(init_ckpt).assert_existing_objects_matched()
+            latest_checkpoint = tf.train.latest_checkpoint(init_ckpt)
+            logging.info(f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.')
+            ckpt.restore(latest_checkpoint).assert_existing_objects_matched()
             logging.info('Loading from checkpoint file completed')
 
       if FLAGS.init_weights:

diff --git a/deepray/version.py b/deepray/version.py
@@ -21,7 +21,7 @@
 # We follow Semantic Versioning (https://semver.org/)
 _MAJOR_VERSION = "0"
 _MINOR_VERSION = "21"
-_PATCH_VERSION = "4"
+_PATCH_VERSION = "5"
 
 # When building releases, we can update this value on the release branch to
 # reflect the current release candidate ('rc0', 'rc1') or, finally, the official

diff --git a/modelzoo/Recommendation/criteo_ctr/dcn_v2.py b/modelzoo/Recommendation/criteo_ctr/dcn_v2.py
@@ -62,20 +62,28 @@ def __init__(self, interaction, training=False, *args, **kwargs):
       )
 
   def build(self, input_shape):
-    self._embedding_layer = {}
-    for name, dim, dtype in self.feature_map[(self.feature_map['ftype'] == "Categorical")][["name", "dim",
-                                                                                            "dtype"]].values:
-      self._embedding_layer[name] = DistributedDynamicEmbedding(
-          embedding_dim=dim,
-          key_dtype=dtype,
-          value_dtype=tf.float32,
-          initializer=None if self.training else tf.keras.initializers.Zeros(),
-          name='emb' + name,
-          de_option=DynamicEmbeddingOption(
-              device="DRAM",
-              init_capacity=1 * 1024 * 1024,
-          ),
-      )
+    self.embedding_layer = {}
+    for name, dim, dtype, voc_size in self.feature_map[(self.feature_map['ftype'] == "Categorical")][[
+        "name", "dim", "dtype", "voc_size"
+    ]].values:
+      if voc_size and not FLAGS.use_dynamic_embedding:
+        self.embedding_layer[name] = tf.keras.layers.Embedding(
+            input_dim=voc_size,
+            output_dim=dim,
+            embeddings_initializer="uniform",
+        )
+      else:
+        self.embedding_layer[name] = DistributedDynamicEmbedding(
+            embedding_dim=dim,
+            key_dtype=dtype,
+            value_dtype=tf.float32,
+            initializer=None if self.training else tf.keras.initializers.Zeros(),
+            name='emb' + name,
+            de_option=DynamicEmbeddingOption(
+                device="HBM",
+                init_capacity=1 * 1024 * 1024,
+            ),
+        )
 
   def call(self, inputs: Dict[str, tf.Tensor], training=None, mask=None) -> tf.Tensor:
     """Executes forward and backward pass, returns loss.
@@ -91,7 +99,9 @@ def call(self, inputs: Dict[str, tf.Tensor], training=None, mask=None) -> tf.Ten
     for name, dim in self.feature_map[(self.feature_map['ftype'] == "Categorical")][["name", "dim"]].values:
       tensor = inputs[name]
       # (batch_size, emb).
-      sparse_embedding_vecs.append(self._embedding_layer[name](tensor))
+      test = self.embedding_layer[name](tensor)
+      # print(test)
+      sparse_embedding_vecs.append(test)
 
     dense_embedding_vec = self._bottom_stack(dense_features)
 

diff --git a/modelzoo/Recommendation/criteo_ctr/infer.py b/modelzoo/Recommendation/criteo_ctr/infer.py
@@ -11,6 +11,7 @@
 from deepray.datasets.criteo.criteo_tsv_reader import CriteoTsvReader
 from deepray.utils.benchmark import PerformanceCalculator
 from deepray.utils.export import SavedModel
+from dcn_v2 import Ranking
 
 FLAGS = flags.FLAGS
 
@@ -21,24 +22,25 @@ def runner(argv=None):
     argv = [
         sys.argv[0],
         "--batch_size=4096",
-        "--run_eagerly=True",
+        "--run_eagerly=false",
         "--use_dynamic_embedding=True",
         f"--feature_map={dir_path}/feature_map_small.csv",
-        "--model_dir=/workspaces/tmp/export_main_optimized/",
+        "--model_dir=/results/tf_tfra_training_criteo_dcn_fp32_gbs16384_231017142132/export_main",
     ]
   if argv:
     FLAGS(argv, known_only=True)
 
   data_pipe = CriteoTsvReader(use_synthetic_data=True)
   # create data pipeline of train & test dataset
   train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True)
-  model = SavedModel(FLAGS.model_dir, "amp" if FLAGS.dtype else "fp32")
-  signature = model.saved_model_loaded.signatures['serving_default']
-  print(signature)
 
-  _performance_calculator = PerformanceCalculator(0, 1000)
-  num_examples = 0
-  step = 0
+  mode = "123"
+  if mode == "sm_predict":
+    # TODO: bugfix
+    model = SavedModel(FLAGS.model_dir, "amp" if FLAGS.dtype else "fp32")
+  else:
+    model = Ranking(interaction="cross", training=False)
+    model.load_weights(os.path.join(FLAGS.model_dir, "variables/variables"))
 
   a = {
       "feature_14":
@@ -57,16 +59,25 @@ def runner(argv=None):
           )
   }
 
-  b = {
-      "feature_14": tf.constant(np.array([1]), dtype=tf.int32),
-      "feature_15": tf.constant(np.array([1]), dtype=tf.int32),
-      "dense_features": tf.constant(np.array([[1.0, 1.0]]), dtype=tf.float32)
-  }
+  # b = {
+  #     "feature_14": tf.constant(np.array([1]), dtype=tf.int32),
+  #     "feature_15": tf.constant(np.array([1]), dtype=tf.int32),
+  #     "dense_features": tf.constant(np.array([[1.0, 1.0]]), dtype=tf.float32)
+  # }
 
   print(model(a))
-  print(model(b))
+
+  for name in ["feature_14", "feature_15"]:
+    tensor = a[name]
+    test = model.embedding_layer[name](tensor)
+    print(test)
+  # print(model(b))
   exit(0)
 
+  _performance_calculator = PerformanceCalculator(0, 1000)
+  num_examples = 0
+  step = 0
+
   for x, y in train_dataset.take(300):
     preds = model(x)
     step += 1

diff --git a/modelzoo/Recommendation/criteo_ctr/run_horovod.sh b/modelzoo/Recommendation/criteo_ctr/run_horovod.sh
@@ -65,16 +65,18 @@ else
     nsys_command=""
 fi
 
+
 set -x
 $hvd_command $nsys_command python train.py \
     --feature_map=feature_map_small.csv \
     --num_gpus=$num_gpu \
     --batch_size=$batch_size \
     --use_dynamic_embedding=True \
-    --steps_per_summary=50 \
+    --steps_per_summary=10 \
     --run_eagerly=false \
     --save_checkpoint_steps=200 \
-    --steps_per_epoch=200 \
+    --init_checkpoint=/results/tf_tfra_training_criteo_dcn_fp32_gbs4096_231018021802/ckpt_main_model/ \
+    --stop_steps=600 \
     --learning_rate=$learning_rate \
     --epochs=$epochs \
     --model_dir=${RESULTS_DIR} \

diff --git a/modelzoo/Recommendation/criteo_ctr/train.py b/modelzoo/Recommendation/criteo_ctr/train.py
@@ -60,7 +60,14 @@ def main(_):
               dtype=tf.float32
           )
   }
-  print(trainer.model(a))
+
+  logging.info(model(a))
+  logging.info(trainer.model(a))
+
+  for name in ["feature_14", "feature_15"]:
+    tensor = a[name]
+    test = model.embedding_layer[name](tensor)
+    print(test)
 
   export_to_savedmodel(trainer.model)