From 382a6d8d0db9d70f04b713b982827f0d67dbaa65 Mon Sep 17 00:00:00 2001
From: edknv <109497216+edknv@users.noreply.github.com>
Date: Wed, 17 Jan 2024 08:41:33 -0800
Subject: [PATCH] update custom pytorch example to use deberta (#44)

* update custom pytorch example

* comment out test

* lint

* lint
---
 examples/custom_pytorch_model.py | 37 +++++++++++++++++++-------------
 tests/examples/test_scripts.py   |  4 ++++
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/examples/custom_pytorch_model.py b/examples/custom_pytorch_model.py
index 68f32813..5e36364e 100644
--- a/examples/custom_pytorch_model.py
+++ b/examples/custom_pytorch_model.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+from dataclasses import dataclass
 
 import dask_cudf
 import torch
@@ -14,27 +15,28 @@
 NUM_ROWS = 1_000
 
 
-class CFG:
-    model = "sentence-transformers/all-MiniLM-L6-v2"
+@dataclass
+class Config:
+    model = "microsoft/deberta-v3-base"
     fc_dropout = 0.2
     max_len = 512
     out_dim = 3
 
 
 class CustomModel(nn.Module):
-    def __init__(self, cfg, config_path=None, pretrained=False):
+    def __init__(self, config, config_path=None, pretrained=False):
         super().__init__()
-        self.cfg = cfg
+        self.config = config
         if config_path is None:
-            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
+            self.config = AutoConfig.from_pretrained(config.model, output_hidden_states=True)
         else:
             self.config = torch.load(config_path)
         if pretrained:
-            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
+            self.model = AutoModel.from_pretrained(config.model, config=self.config)
         else:
             self.model = AutoModel(self.config)
-        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
-        self.fc = nn.Linear(self.config.hidden_size, self.cfg.out_dim)
+        self.fc_dropout = nn.Dropout(config.fc_dropout)
+        self.fc = nn.Linear(self.config.hidden_size, config.out_dim)
         self._init_weights(self.fc)
 
     def _init_weights(self, module):
@@ -63,8 +65,8 @@ def forward(self, batch):
 
 
 # The user must provide a load_model function
-def load_model(cfg, device, model_path):
-    model = CustomModel(cfg, config_path=None, pretrained=True)
+def load_model(config, device, model_path):
+    model = CustomModel(config, config_path=None, pretrained=True)
     model = model.to(device)
 
     if os.path.exists(model_path):
@@ -77,10 +79,14 @@
 
 
 class MyModel(HFModel):
-    def load_model(self, device="cuda"):
-        return load_model(CFG, device=device, model_path=self.path_or_name)
+    def __init__(self, config):
+        self.config = config
+        super().__init__(self.config.model)
 
-    def load_cfg(self):
+    def load_model(self, model_path=None, device="cuda"):
+        return load_model(self.config, device=device, model_path=model_path or self.path_or_name)
+
+    def load_config(self):
         return AutoConfig.from_pretrained(self.path_or_name)
 
 
@@ -114,12 +120,13 @@ def main():
     labels = ["foo", "bar", "baz"]
 
     with cf.Distributed(rmm_pool_size=args.pool_size, n_workers=args.num_workers):
-        model = MyModel(CFG.model)
+        model = MyModel(Config)
         pipe = op.Sequential(
-            op.Tokenizer(model, cols=[args.input_column]),
+            op.Tokenizer(model, cols=[args.input_column], tokenizer_type="sentencepiece"),
             op.Predictor(model, sorted_data_loader=True, batch_size=args.batch_size),
             op.Labeler(labels, cols=["preds"]),
             repartition=args.partitions,
+            keep_cols=[args.input_column],
         )
         outputs = pipe(ddf)
         outputs.to_parquet(args.output_parquet_path)
diff --git a/tests/examples/test_scripts.py b/tests/examples/test_scripts.py
index cb9393c7..d4d609fe 100644
--- a/tests/examples/test_scripts.py
+++ b/tests/examples/test_scripts.py
@@ -8,6 +8,10 @@
 import sys  # noqa: E402
 import tempfile  # noqa: E402
 
+# from uuid import uuid4  # noqa: E402
+
+# from crossfit.dataset.load import load_dataset  # noqa: E402
+
 examples_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "examples")
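
Usage sketch (illustrative, not part of the applied diff): after this change,
MyModel is constructed from the Config dataclass itself rather than a bare
model-name string, and keep_cols preserves the input column in the output. A
minimal driver mirroring the API used in the hunks above might look like the
following; the input/output paths and the "text" column name are assumptions,
not values from this patch.

    import crossfit as cf
    import dask_cudf
    from crossfit import op

    # Config and MyModel as defined in examples/custom_pytorch_model.py above.
    ddf = dask_cudf.read_parquet("input.parquet")  # hypothetical input path

    with cf.Distributed(rmm_pool_size="12GB", n_workers=2):
        model = MyModel(Config)  # pass the dataclass, not Config.model
        pipe = op.Sequential(
            op.Tokenizer(model, cols=["text"], tokenizer_type="sentencepiece"),
            op.Predictor(model, sorted_data_loader=True, batch_size=64),
            op.Labeler(["foo", "bar", "baz"], cols=["preds"]),
            keep_cols=["text"],  # keep the input column alongside predictions
        )
        pipe(ddf).to_parquet("output.parquet")  # hypothetical output path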