Skip to content

Commit

Permalink
use max length from model
Browse files (browse the repository at this point in the history)
  • Loading branch information
edknv committed Dec 18, 2023
1 parent 191e1ee commit 4cfd24a
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
5 changes: 3 additions & 2 deletions crossfit/op/tokenize.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import os
from typing import Optional

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer, _cast_to_appropriate_type
Expand All @@ -32,11 +33,11 @@ def __init__(
cols=None,
keep_cols=None,
pre=None,
max_length: int = 1024,
max_length: Optional[int] = None,
):
super().__init__(pre=pre, cols=cols, keep_cols=keep_cols)
self.model = model
self.max_length = max_length
self.max_length = max_length or model.max_seq_length()

# Make sure we download the tokenizer just once
GPUTokenizer.from_pretrained(self.model)
Expand Down
1 change: 1 addition & 0 deletions tests/examples/test_scripts.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_script_execution(script):
# argv[0] will be replaced by runpy
sys.argv = [
"",
"--overwrite",
"--num-workers",
"1",
"--dataset",
Expand Down

0 comments on commit 4cfd24a

Please sign in to comment.