Skip to content

Commit

Permalink
Merge pull request #28 from deepray-AI/recommendation
Browse files Browse the repository at this point in the history
init_checkpoint
  • Loading branch information
fuhailin authored Oct 18, 2023
2 parents c7024ed + b32d0cf commit 9e892f8
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 35 deletions.
5 changes: 3 additions & 2 deletions deepray/core/base_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,8 +611,9 @@ def fit(
if FLAGS.init_checkpoint:
for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), FLAGS.init_checkpoint):
if init_ckpt:
logging.info(f'Checkpoint file {init_ckpt} found and restoring from initial checkpoint for {name} model.')
ckpt.restore(init_ckpt).assert_existing_objects_matched()
latest_checkpoint = tf.train.latest_checkpoint(init_ckpt)
logging.info(f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.')
ckpt.restore(latest_checkpoint).assert_existing_objects_matched()
logging.info('Loading from checkpoint file completed')

if FLAGS.init_weights:
Expand Down
2 changes: 1 addition & 1 deletion deepray/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# We follow Semantic Versioning (https://semver.org/)
_MAJOR_VERSION = "0"
_MINOR_VERSION = "21"
_PATCH_VERSION = "4"
_PATCH_VERSION = "5"

# When building releases, we can update this value on the release branch to
# reflect the current release candidate ('rc0', 'rc1') or, finally, the official
Expand Down
40 changes: 25 additions & 15 deletions modelzoo/Recommendation/criteo_ctr/dcn_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,28 @@ def __init__(self, interaction, training=False, *args, **kwargs):
)

def build(self, input_shape):
self._embedding_layer = {}
for name, dim, dtype in self.feature_map[(self.feature_map['ftype'] == "Categorical")][["name", "dim",
"dtype"]].values:
self._embedding_layer[name] = DistributedDynamicEmbedding(
embedding_dim=dim,
key_dtype=dtype,
value_dtype=tf.float32,
initializer=None if self.training else tf.keras.initializers.Zeros(),
name='emb' + name,
de_option=DynamicEmbeddingOption(
device="DRAM",
init_capacity=1 * 1024 * 1024,
),
)
self.embedding_layer = {}
for name, dim, dtype, voc_size in self.feature_map[(self.feature_map['ftype'] == "Categorical")][[
"name", "dim", "dtype", "voc_size"
]].values:
if voc_size and not FLAGS.use_dynamic_embedding:
self.embedding_layer[name] = tf.keras.layers.Embedding(
input_dim=voc_size,
output_dim=dim,
embeddings_initializer="uniform",
)
else:
self.embedding_layer[name] = DistributedDynamicEmbedding(
embedding_dim=dim,
key_dtype=dtype,
value_dtype=tf.float32,
initializer=None if self.training else tf.keras.initializers.Zeros(),
name='emb' + name,
de_option=DynamicEmbeddingOption(
device="HBM",
init_capacity=1 * 1024 * 1024,
),
)

def call(self, inputs: Dict[str, tf.Tensor], training=None, mask=None) -> tf.Tensor:
"""Executes forward and backward pass, returns loss.
Expand All @@ -91,7 +99,9 @@ def call(self, inputs: Dict[str, tf.Tensor], training=None, mask=None) -> tf.Ten
for name, dim in self.feature_map[(self.feature_map['ftype'] == "Categorical")][["name", "dim"]].values:
tensor = inputs[name]
# (batch_size, emb).
sparse_embedding_vecs.append(self._embedding_layer[name](tensor))
test = self.embedding_layer[name](tensor)
# print(test)
sparse_embedding_vecs.append(test)

dense_embedding_vec = self._bottom_stack(dense_features)

Expand Down
39 changes: 25 additions & 14 deletions modelzoo/Recommendation/criteo_ctr/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from deepray.datasets.criteo.criteo_tsv_reader import CriteoTsvReader
from deepray.utils.benchmark import PerformanceCalculator
from deepray.utils.export import SavedModel
from dcn_v2 import Ranking

FLAGS = flags.FLAGS

Expand All @@ -21,24 +22,25 @@ def runner(argv=None):
argv = [
sys.argv[0],
"--batch_size=4096",
"--run_eagerly=True",
"--run_eagerly=false",
"--use_dynamic_embedding=True",
f"--feature_map={dir_path}/feature_map_small.csv",
"--model_dir=/workspaces/tmp/export_main_optimized/",
"--model_dir=/results/tf_tfra_training_criteo_dcn_fp32_gbs16384_231017142132/export_main",
]
if argv:
FLAGS(argv, known_only=True)

data_pipe = CriteoTsvReader(use_synthetic_data=True)
# create data pipeline of train & test dataset
train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True)
model = SavedModel(FLAGS.model_dir, "amp" if FLAGS.dtype else "fp32")
signature = model.saved_model_loaded.signatures['serving_default']
print(signature)

_performance_calculator = PerformanceCalculator(0, 1000)
num_examples = 0
step = 0
mode = "123"
if mode == "sm_predict":
# TODO: bugfix
model = SavedModel(FLAGS.model_dir, "amp" if FLAGS.dtype else "fp32")
else:
model = Ranking(interaction="cross", training=False)
model.load_weights(os.path.join(FLAGS.model_dir, "variables/variables"))

a = {
"feature_14":
Expand All @@ -57,16 +59,25 @@ def runner(argv=None):
)
}

b = {
"feature_14": tf.constant(np.array([1]), dtype=tf.int32),
"feature_15": tf.constant(np.array([1]), dtype=tf.int32),
"dense_features": tf.constant(np.array([[1.0, 1.0]]), dtype=tf.float32)
}
# b = {
# "feature_14": tf.constant(np.array([1]), dtype=tf.int32),
# "feature_15": tf.constant(np.array([1]), dtype=tf.int32),
# "dense_features": tf.constant(np.array([[1.0, 1.0]]), dtype=tf.float32)
# }

print(model(a))
print(model(b))

for name in ["feature_14", "feature_15"]:
tensor = a[name]
test = model.embedding_layer[name](tensor)
print(test)
# print(model(b))
exit(0)

_performance_calculator = PerformanceCalculator(0, 1000)
num_examples = 0
step = 0

for x, y in train_dataset.take(300):
preds = model(x)
step += 1
Expand Down
6 changes: 4 additions & 2 deletions modelzoo/Recommendation/criteo_ctr/run_horovod.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,18 @@ else
nsys_command=""
fi


set -x
$hvd_command $nsys_command python train.py \
--feature_map=feature_map_small.csv \
--num_gpus=$num_gpu \
--batch_size=$batch_size \
--use_dynamic_embedding=True \
--steps_per_summary=50 \
--steps_per_summary=10 \
--run_eagerly=false \
--save_checkpoint_steps=200 \
--steps_per_epoch=200 \
--init_checkpoint=/results/tf_tfra_training_criteo_dcn_fp32_gbs4096_231018021802/ckpt_main_model/ \
--stop_steps=600 \
--learning_rate=$learning_rate \
--epochs=$epochs \
--model_dir=${RESULTS_DIR} \
Expand Down
9 changes: 8 additions & 1 deletion modelzoo/Recommendation/criteo_ctr/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,14 @@ def main(_):
dtype=tf.float32
)
}
print(trainer.model(a))

logging.info(model(a))
logging.info(trainer.model(a))

for name in ["feature_14", "feature_15"]:
tensor = a[name]
test = model.embedding_layer[name](tensor)
print(test)

export_to_savedmodel(trainer.model)

Expand Down

0 comments on commit 9e892f8

Please sign in to comment.