
Commit

try smaller partitions
edknv committed Dec 18, 2023
1 parent 4d1d822 commit 106f5e7
Showing 3 changed files with 51 additions and 0 deletions.
7 changes: 7 additions & 0 deletions examples/beir_report.py
@@ -20,6 +20,12 @@ def parse_arguments():
    )
    parser.add_argument("--batch-size", type=int, default=64, help="Batch size")
    parser.add_argument("--k", type=int, default=10, help="Nearest neighbors")
    parser.add_argument(
        "--partition-num",
        type=int,
        default=50_000,
        help="Number of items to allocate to each partition",
    )

    args = parser.parse_args()
    return args
@@ -39,6 +45,7 @@ def main():
        overwrite=args.overwrite,
        sorted_data_loader=args.sorted_dataloader,
        batch_size=args.batch_size,
        partition_num=args.partition_num,
    )

    report.console()
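The diff above only threads `--partition-num` through to the report call; how the value is consumed is not shown in this commit. As a rough sketch only (assuming `partition_num` means an approximate number of rows per partition, which this diff does not confirm), repartitioning a dask_cudf collection by row count might look like the following. `repartition_by_rows` is a hypothetical helper, not crossfit API:

import math

import cudf
import dask_cudf


def repartition_by_rows(ddf, partition_num):
    # Hypothetical helper: split the collection so each partition holds roughly
    # `partition_num` rows (smaller partitions lower peak GPU memory per task).
    n_rows = len(ddf)  # triggers a row-count computation over all partitions
    npartitions = max(1, math.ceil(n_rows / partition_num))
    return ddf.repartition(npartitions=npartitions)


if __name__ == "__main__":
    df = cudf.DataFrame({"x": list(range(100_000))})
    ddf = dask_cudf.from_cudf(df, npartitions=1)
    ddf = repartition_by_rows(ddf, partition_num=50_000)
    print(ddf.npartitions)  # 2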
42 changes: 42 additions & 0 deletions examples/embed.py
@@ -0,0 +1,42 @@
import cudf
import dask_cudf

import crossfit as cf
from crossfit import op


def create_sample_ddf():
    df = cudf.DataFrame(
        {
            "text": [
                "query: how much protein should a female eat",
                "query: summit define",
                "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
                "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
            ]
        }
    )

    npartitions = 2  # assume 2 GPUs and data is small enough to fit on 2 GPUs
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    return df


if __name__ == "__main__":
    df = create_sample_ddf()

    model = cf.SentenceTransformerModel("intfloat/e5-large-v2")

    with cf.Distributed(rmm_pool_size="16GB", n_workers=2):
        tokenizer = op.Tokenizer(model, cols=["text"])
        tokens = tokenizer(df)

        num_tokens = tokens.input_ids.map_partitions(
            # work around `list_series.list.index(0)` not working by casting list values to int.
            lambda s: s.list.astype(int).list.index(0).replace(-1, s.list.len().iloc[0]),
            meta=("input_ids", "int"),
        ).to_frame()
        num_tokens.to_parquet("temp_num_tokens.parquet")

    print(dask_cudf.read_parquet("temp_num_tokens.parquet").compute())
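Not part of the commit, but a possible follow-up once `temp_num_tokens.parquet` has been written: summarize the per-row token counts with standard Dask reductions (the column name `input_ids` matches the frame written above):

import dask_cudf

num_tokens = dask_cudf.read_parquet("temp_num_tokens.parquet")
# Mean and maximum token count across the sample rows.
print(num_tokens["input_ids"].mean().compute())
print(num_tokens["input_ids"].max().compute())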
2 changes: 2 additions & 0 deletions tests/examples/test_scripts.py
@@ -35,6 +35,8 @@ def test_script_execution(script):
        "12GB",
        "--batch-size",
        "8",
        "--partition-num",
        "10000",
    ]
    runpy.run_path(
        tmp_path,
