Update example notebooks to create schema object from merlin core (#650)

* create schema from core
* update ETL and read schema from core
* read schema from core
* update nbs read schema from core
NVIDIA-Merlin · Mar 17, 2023 · a293d8c · a293d8c
1 parent 8c13823
commit a293d8c
Show file tree

Hide file tree

Showing 11 changed files with 3,037 additions and 3,129 deletions.
diff --git a/examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb b/examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb
diff --git a/examples/end-to-end-session-based/02-End-to-end-session-based-with-Yoochoose-PyT.ipynb b/examples/end-to-end-session-based/02-End-to-end-session-based-with-Yoochoose-PyT.ipynb
diff --git a/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb b/examples/end-to-end-session-based/03-Session-based-Yoochoose-multigpu-training-PyT.ipynb
@@ -119,6 +119,8 @@
     "from transformers4rec import torch as tr\n",
     "from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt\n",
     "from transformers4rec.torch.utils.examples_utils import wipe_memory\n",
+    "from merlin.schema import Schema\n",
+    "from merlin.io import Dataset\n",
     "\n",
     "import argparse\n",
     "\n",
@@ -131,14 +133,15 @@
     "parser.add_argument('--per-device-eval-batch-size', type=int, default=512, help='Per device batch size for evaluation')\n",
     "sh_args = parser.parse_args()\n",
     "\n",
-    "# create the schema object by reading the schema.pbtxt file generated by NVTabular pipeline in the previous 01-ETL-with-NVTabular notebook\n",
-    "from merlin_standard_lib import Schema\n",
-    "SCHEMA_PATH = \"schema_demo.pb\"\n",
-    "schema = Schema().from_proto_text(SCHEMA_PATH)\n",
+    "# create the schema object by reading the processed train set generated in the previous 01-ETL-with-NVTabular notebook\n",
+    "\n",
+    "INPUT_DATA_DIR = os.environ.get(\"INPUT_DATA_DIR\", \"/workspace/data\")\n",
+    "train = Dataset(os.path.join(INPUT_DATA_DIR, \"processed_nvt/part_0.parquet\"))\n",
+    "schema = train.schema\n",
     "\n",
     "# select the subset of features we want to use for training the model by their tags or their names.\n",
     "schema = schema.select_by_name(\n",
-    "   ['item_id-list_seq', 'category-list_seq', 'product_recency_days_log_norm-list_seq', 'et_dayofweek_sin-list_seq']\n",
+    "   ['item_id-list', 'category-list', 'product_recency_days_log_norm-list', 'et_dayofweek_sin-list']\n",
     ")\n",
     "\n",
     "max_sequence_length, d_model = 20, 320\n",
@@ -256,7 +259,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m torch.distributed.launch --nproc_per_node 2 pyt_trainer.py --path \"./preproc_sessions_by_day\" --learning-rate 0.0005"
+    "!python -m torch.distributed.launch --nproc_per_node 2 pyt_trainer.py --path \"/workspace/data/preproc_sessions_by_day\" --learning-rate 0.0005"
    ]
   },
   {
@@ -284,8 +287,7 @@
     "\n",
     "- Merlin Transformers4rec: https://github.com/NVIDIA-Merlin/Transformers4Rec\n",
     "\n",
-    "- Merlin NVTabular: https://github.com/NVIDIA-Merlin/NVTabular/tree/main/nvtabular",
-    "\n",
+    "- Merlin NVTabular: https://github.com/NVIDIA-Merlin/NVTabular/tree/main/nvtabular\n",
     "- Merlin Dataloader: https://github.com/NVIDIA-Merlin/dataloader"
    ]
   }

diff --git a/examples/end-to-end-session-based/schema_demo.pb b/examples/end-to-end-session-based/schema_demo.pb