Skip to content

Commit

Permalink
tokenizer_name, not path
Browse files Browse the repository at this point in the history
  • Loading branch information
thesofakillers committed Jan 16, 2023
1 parent 3d1413c commit b5ab48e
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 8 deletions.
2 changes: 0 additions & 2 deletions claficle/conf/model/base_gewechselt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,3 @@ defaults:

source_lang: "en"
target_lang: ???

tokenizer_path: null # path to save to/load from for trained target lang tokenizer
2 changes: 1 addition & 1 deletion claficle/conf/setup_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ seed: 1

disable_wandb: False
lang: null
tokenizer_path: null
tokenizer_name: null # name of tokenizer in checkpoints/tokenizers/

# to avoid hydra creating output dirs: https://stackoverflow.com/a/64635492/9889508
hydra:
Expand Down
6 changes: 3 additions & 3 deletions claficle/data/oscar.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,14 @@ def main(cfg: DictConfig):
oscar.prepare_data()

# optionally, load the tokenizer and perform tokenization
if cfg.tokenizer_path is not None:
if cfg.tokenizer_name is not None:
tokenizer = transformers.AutoTokenizer.from_pretrained(
os.path.join("checkpoints", "tokenizers", cfg.tokenizer_path)
os.path.join("checkpoints", "tokenizers", cfg.tokenizer_name)
)
oscar.set_tokenizer(tokenizer)
oscar.setup()
# for english, we can always do the tokenization
elif cfg.tokenizer_path is None and cfg.lang == "en":
elif cfg.tokenizer_name is None and cfg.lang == "en":
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2-large")
oscar.set_tokenizer(tokenizer)
oscar.setup()
Expand Down
4 changes: 2 additions & 2 deletions slurm/data/oscar/setup.array.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
lang=fr tokenizer_path=tokenizer_fr
lang=de tokenizer_path=tokenizer_de
lang=fr tokenizer_name=fr_gewechselt
lang=de tokenizer_name=de_gewechselt

0 comments on commit b5ab48e

Please sign in to comment.