
Commit

try smaller partitions
edknv committed Dec 18, 2023
1 parent 4d1d822 commit 106f5e7
Showing 3 changed files with 51 additions and 0 deletions.
7 changes: 7 additions & 0 deletions examples/beir_report.py
@@ -20,6 +20,12 @@ def parse_arguments():
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Batch size")
    parser.add_argument("--k", type=int, default=10, help="Nearest neighbors")
    parser.add_argument(
        "--partition-num",
        type=int,
        default=50_000,
        help="Number of items to allocate to each partition",
    )

    args = parser.parse_args()
    return args
@@ -39,6 +45,7 @@ def main():
        overwrite=args.overwrite,
        sorted_data_loader=args.sorted_dataloader,
        batch_size=args.batch_size,
        partition_num=args.partition_num,
    )

    report.console()
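The diff above only threads `--partition-num` through to the report call; how the value is consumed is not shown in this commit. As a rough sketch only (assuming `partition_num` means an approximate number of rows per partition, which this diff does not confirm), repartitioning a dask_cudf collection by row count might look like the following. `repartition_by_rows` is a hypothetical helper, not crossfit API:

import math

import cudf
import dask_cudf


def repartition_by_rows(ddf, partition_num):
    # Hypothetical helper: split the collection so each partition holds roughly
    # `partition_num` rows (smaller partitions lower peak GPU memory per task).
    n_rows = len(ddf)  # triggers a row-count computation over all partitions
    npartitions = max(1, math.ceil(n_rows / partition_num))
    return ddf.repartition(npartitions=npartitions)


if __name__ == "__main__":
    df = cudf.DataFrame({"x": list(range(100_000))})
    ddf = dask_cudf.from_cudf(df, npartitions=1)
    ddf = repartition_by_rows(ddf, partition_num=50_000)
    print(ddf.npartitions)  # 2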
42 changes: 42 additions & 0 deletions examples/embed.py
@@ -0,0 +1,42 @@
import cudf
import dask_cudf

import crossfit as cf
from crossfit import op


def create_sample_ddf():
    df = cudf.DataFrame(
        {
            "text": [
                "query: how much protein should a female eat",
                "query: summit define",
                "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
                "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
            ]
        }
    )

    npartitions = 2  # assume 2 GPUs and data is small enough to fit on 2 GPUs
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    return df


if __name__ == "__main__":
    df = create_sample_ddf()

    model = cf.SentenceTransformerModel("intfloat/e5-large-v2")

    with cf.Distributed(rmm_pool_size="16GB", n_workers=2):
        tokenizer = op.Tokenizer(model, cols=["text"])
        tokens = tokenizer(df)

        num_tokens = tokens.input_ids.map_partitions(
            # work around `list_series.list.index(0)` not working by casting list values to int.
            lambda s: s.list.astype(int).list.index(0).replace(-1, s.list.len().iloc[0]),
            meta=("input_ids", "int"),
        ).to_frame()
        num_tokens.to_parquet("temp_num_tokens.parquet")

    print(dask_cudf.read_parquet("temp_num_tokens.parquet").compute())
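Not part of the commit, but a possible follow-up once `temp_num_tokens.parquet` has been written: summarize the per-row token counts with standard Dask reductions (the column name `input_ids` matches the frame written above):

import dask_cudf

num_tokens = dask_cudf.read_parquet("temp_num_tokens.parquet")
# Mean and maximum token count across the sample rows.
print(num_tokens["input_ids"].mean().compute())
print(num_tokens["input_ids"].max().compute())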
2 changes: 2 additions & 0 deletions tests/examples/test_scripts.py
@@ -35,6 +35,8 @@ def test_script_execution(script):
        "12GB",
        "--batch-size",
        "8",
        "--partition-num",
        "10000",
    ]
    runpy.run_path(
        tmp_path,
