diff --git a/examples/pytorch/language-modeling/README.md b/examples/pytorch/language-modeling/README.md
index 3069fe9eb974c1..b13cebde5f5796 100644
--- a/examples/pytorch/language-modeling/README.md
+++ b/examples/pytorch/language-modeling/README.md
@@ -36,7 +36,7 @@ the tokenization). The loss here is that of causal language modeling.
 
 ```bash
 python run_clm.py \
-    --model_name_or_path gpt2 \
+    --model_name_or_path openai-community/gpt2 \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -53,7 +53,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_clm.py \
-    --model_name_or_path gpt2 \
+    --model_name_or_path openai-community/gpt2 \
     --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -67,12 +67,63 @@ This uses the built in HuggingFace `Trainer` for training. If you want to use a
 
 ```bash
 python run_clm_no_trainer.py \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --model_name_or_path openai-community/gpt2 \
+    --output_dir /tmp/test-clm
+```
+
+### GPT-2/GPT and causal language modeling with the fill-in-the-middle objective
+
+The following example fine-tunes GPT-2 on WikiText-2 using the fill-in-the-middle (FIM) training objective. The FIM objective was proposed in [Efficient Training of Language Models to Fill in the Middle](https://arxiv.org/abs/2207.14255). The authors showed that autoregressive language models can learn to infill text after applying a straightforward transformation to the dataset, which simply moves a span of text from the middle of a document to its end.
+
+We're using the raw WikiText-2 (no tokens were replaced before the tokenization). The loss here is that of causal language modeling.
+
+```bash
+python run_fim.py \
+    --model_name_or_path openai-community/gpt2 \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --fim_rate 0.5 \
+    --fim_spm_rate 0.2 \
+    --do_train \
+    --do_eval \
+    --output_dir /tmp/test-clm
+```
+
+To run on your own training and validation files, use the following command:
+
+```bash
+python run_fim.py \
+    --model_name_or_path openai-community/gpt2 \
+    --train_file path_to_train_file \
+    --validation_file path_to_validation_file \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --fim_rate 0.5 \
+    --fim_spm_rate 0.2 \
+    --do_train \
+    --do_eval \
+    --output_dir /tmp/test-clm
+```
+
+This uses the built-in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_fim_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
+
+```bash
+python run_fim_no_trainer.py \
+    --model_name_or_path openai-community/gpt2 \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path gpt2 \
+    --fim_rate 0.5 \
+    --fim_spm_rate 0.2 \
     --output_dir /tmp/test-clm
 ```
+
+**Note**: Passing a FIM rate of `0.5` means that FIM transformations will be applied to the dataset with a probability of 50%, while a FIM SPM rate of `0.2` means that 20% of those FIM transformations will use SPM (Suffix-Prefix-Middle) ordering and the remaining 80% will use PSM (Prefix-Suffix-Middle) ordering.
+
 ### RoBERTa/BERT/DistilBERT and masked language modeling
 
 The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
@@ -84,7 +135,7 @@ converge slightly slower (over-fitting takes more epochs).
 
 ```bash
 python run_mlm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -98,7 +149,7 @@ To run on your own training and validation files, use the following command:
 
 ```bash
 python run_mlm.py \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
    --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -117,7 +168,7 @@ This uses the built in HuggingFace `Trainer` for training. If you want to use a
 python run_mlm_no_trainer.py \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path roberta-base \
+    --model_name_or_path FacebookAI/roberta-base \
     --output_dir /tmp/test-mlm
 ```
 
@@ -144,7 +195,7 @@ Here is how to fine-tune XLNet on wikitext-2:
 
 ```bash
 python run_plm.py \
-    --model_name_or_path=xlnet-base-cased \
+    --model_name_or_path=xlnet/xlnet-base-cased \
     --dataset_name wikitext \
     --dataset_config_name wikitext-2-raw-v1 \
     --per_device_train_batch_size 8 \
@@ -158,7 +209,7 @@ To fine-tune it on your own training and validation file, run:
 
 ```bash
 python run_plm.py \
-    --model_name_or_path=xlnet-base-cased \
+    --model_name_or_path=xlnet/xlnet-base-cased \
     --train_file path_to_train_file \
     --validation_file path_to_validation_file \
     --per_device_train_batch_size 8 \
@@ -176,11 +227,11 @@ sure all your batches have the same length.
 
 ## Streaming
 
-To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` to the command line. This is currently supported by `run_mlm.py` and `run_clm.py`.
+To use the streaming dataset mode, which can be very useful for large datasets, add `--streaming` to the command line. This is supported by `run_mlm.py`, `run_clm.py` and `run_fim.py`. For the other scripts, you can adapt them to this use case by taking inspiration from these three.
 
 ## Low Cpu Memory Usage
 
-To use low cpu memory mode which can be very useful for LLM, add `--low_cpu_mem_usage` to the command line. This is currently supported by `run_clm.py`,`run_mlm.py`, `run_plm.py`,`run_mlm_no_trainer.py` and `run_clm_no_trainer.py`.
+To use low CPU memory mode, which can be very useful for LLMs, add `--low_cpu_mem_usage` to the command line. This is currently supported by `run_clm.py`, `run_mlm.py`, `run_plm.py`, `run_fim.py`, `run_mlm_no_trainer.py`, `run_clm_no_trainer.py` and `run_fim_no_trainer.py`.
 
 ## Creating a model on the fly
@@ -188,8 +239,8 @@ When training a model from scratch, configuration values may be overridden with
 
 ```bash
-python run_clm.py --model_type gpt2 --tokenizer_name gpt2 \ --config_overrides="n_embd=1024,n_head=16,n_layer=48,n_positions=102" \
+python run_clm.py --model_type gpt2 --tokenizer_name openai-community/gpt2 \
+--config_overrides="n_embd=1024,n_head=16,n_layer=48,n_positions=102" \
 [...]
 ```
 
-This feature is only available in `run_clm.py`, `run_plm.py` and `run_mlm.py`.
+This feature is only available in `run_clm.py`, `run_plm.py`, `run_mlm.py` and `run_fim.py`.
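To make the FIM note above concrete, here is a minimal, self-contained sketch of the kind of transformation that `--fim_rate` and `--fim_spm_rate` control. It is not the implementation used by `run_fim.py`; the sentinel token names and the character-level split are assumptions chosen purely for illustration:

```python
import random

# Hypothetical sentinel markers; FIM-trained models define their own special tokens.
PREFIX_TOK, MIDDLE_TOK, SUFFIX_TOK = "<fim_prefix>", "<fim_middle>", "<fim_suffix>"


def apply_fim(text: str, fim_rate: float = 0.5, fim_spm_rate: float = 0.2, rng=random) -> str:
    """Move a random middle span of `text` to the end with probability `fim_rate`.

    Among transformed samples, a fraction `fim_spm_rate` uses SPM ordering
    (suffix before prefix); the rest use PSM ordering (prefix before suffix).
    """
    if rng.random() > fim_rate:
        # Left untouched: trained on as a regular causal language modeling sample.
        return text
    # Pick two cut points delimiting prefix / middle / suffix.
    lo, hi = sorted(rng.sample(range(len(text) + 1), 2))
    prefix, middle, suffix = text[:lo], text[lo:hi], text[hi:]
    if rng.random() < fim_spm_rate:
        # SPM: Suffix-Prefix-Middle
        return f"{PREFIX_TOK}{SUFFIX_TOK}{suffix}{MIDDLE_TOK}{prefix}{middle}"
    # PSM: Prefix-Suffix-Middle
    return f"{PREFIX_TOK}{prefix}{SUFFIX_TOK}{suffix}{MIDDLE_TOK}{middle}"


print(apply_fim("The quick brown fox jumps over the lazy dog."))
```

The model is still trained with the ordinary causal language modeling loss on the rearranged sample, which is what lets it learn to infill the moved middle span.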
diff --git a/examples/pytorch/language-modeling/requirements.txt b/examples/pytorch/language-modeling/requirements.txt index 19c487fe3f6312..851e8de09ccdc1 100644 --- a/examples/pytorch/language-modeling/requirements.txt +++ b/examples/pytorch/language-modeling/requirements.txt @@ -1,6 +1,6 @@ accelerate >= 0.12.0 torch >= 1.3 -datasets >= 1.8.0 +datasets >= 2.14.0 sentencepiece != 0.1.92 protobuf evaluate diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 0637f34d58bcaa..d8a88f97881be5 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -25,7 +25,6 @@ import math import os import sys -import time from dataclasses import dataclass, field from itertools import chain from typing import Optional @@ -35,13 +34,22 @@ # Initialize MLFlow import mlflow import torch -import transformers from datasets import load_dataset -from transformers import (CONFIG_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, AutoModelForCausalLM, AutoTokenizer, - HfArgumentParser, Trainer, TrainingArguments, - default_data_collator, is_torch_tpu_available, - set_seed) + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + is_torch_xla_available, + set_seed, +) from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version, send_example_telemetry @@ -52,15 +60,13 @@ from utils.utils import get_num_parameters # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.29.0") +check_min_version("4.44.0") -require_version( - "datasets>=1.8.0", - "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt" -) +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) + MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -74,91 +80,83 @@ class ModelArguments: model_name_or_path: Optional[str] = field( default="microsoft/prophetnet-large-uncased", metadata={ - "help": - ("The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) + "help": ( + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." + ) }, ) model_type: Optional[str] = field( default=None, - metadata={ - "help": - "If training from scratch, pass a model type from the list: " + - ", ".join(MODEL_TYPES) - }, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, ) config_overrides: Optional[str] = field( default=None, metadata={ - "help": - ("Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) + "help": ( + "Override some existing default config settings when a model is trained from scratch. 
Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) }, ) config_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained config name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) tokenizer_name: Optional[str] = field( - default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }) + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) cache_dir: Optional[str] = field( default=None, - metadata={ - "help": - "Where do you want to store the pretrained models downloaded from huggingface.co" - }, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) use_fast_tokenizer: bool = field( default=True, - metadata={ - "help": - "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not." - }, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, ) model_revision: str = field( default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + token: str = field( + default=None, metadata={ - "help": - "The specific model version to use (can be a branch name, tag name or commit id)." + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) }, ) - use_auth_token: bool = field( + trust_remote_code: bool = field( default=False, metadata={ - "help": - ("Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models).") + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ) }, ) torch_dtype: Optional[str] = field( default=None, metadata={ - "help": - ("Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights."), + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), "choices": ["auto", "bfloat16", "float16", "float32"], }, ) low_cpu_mem_usage: bool = field( default=False, metadata={ - "help": - ("It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." - "set True will benefit LLM loading time and RAM consumption.") + "help": ( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." 
+ ) }, ) def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None - or self.model_name_or_path - is not None): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): raise ValueError( "--config_overrides can't be used in combination with --config_name or --model_name_or_path" ) @@ -171,98 +169,75 @@ class DataTrainingArguments: """ dataset_name: Optional[str] = field( - default="wikitext", - metadata={ - "help": - "The name of the dataset to use (via the datasets library)." - }) + default="wikitext", metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) dataset_config_name: Optional[str] = field( - default="wikitext-2-raw-v1", - metadata={ - "help": - "The configuration name of the dataset to use (via the datasets library)." - }) - train_file: Optional[str] = field( - default=None, - metadata={"help": "The input training data file (a text file)."}) + default="wikitext-2-raw-v1", metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) validation_file: Optional[str] = field( default=None, - metadata={ - "help": - "An optional input evaluation data file to evaluate the perplexity on (a text file)." - }, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, ) max_train_samples: Optional[int] = field( default=None, metadata={ - "help": - ("For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set.") + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) }, ) max_eval_samples: Optional[int] = field( default=None, metadata={ - "help": - ("For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set.") + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) }, ) - streaming: bool = field(default=False, - metadata={"help": "Enable streaming mode"}) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) block_size: Optional[int] = field( default=1024, metadata={ - "help": - ("Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." - ) + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ ) }, ) overwrite_cache: bool = field( - default=False, - metadata={"help": "Overwrite the cached training and evaluation sets"}) + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) validation_split_percentage: Optional[int] = field( default=5, metadata={ - "help": - "The percentage of the train set used as validation set in case there's no validation split" + "help": "The percentage of the train set used as validation set in case there's no validation split" }, ) preprocessing_num_workers: Optional[int] = field( default=None, - metadata={ - "help": "The number of processes to use for the preprocessing." - }, + metadata={"help": "The number of processes to use for the preprocessing."}, ) keep_linebreaks: bool = field( - default=True, - metadata={ - "help": "Whether to keep line breaks when using TXT files or not." - }) + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) def __post_init__(self): if self.streaming: - require_version( - "datasets>=2.0.0", - "The streaming feature requires `datasets>=2.0.0`") + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError( - "Need either a dataset name or a training/validation file.") + raise ValueError("Need either a dataset name or a training/validation file.") else: if self.train_file is not None: extension = self.train_file.split(".")[-1] - assert extension in [ - "csv", "json", "txt" - ], "`train_file` should be a csv, a json or a txt file." + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." if self.validation_file is not None: extension = self.validation_file.split(".")[-1] - assert extension in [ - "csv", "json", "txt" - ], "`validation_file` should be a csv, a json or a txt file." - + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." def main(): @@ -270,16 +245,13 @@ def main(): # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser( - (ModelArguments, DataTrainingArguments, TrainingArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1])) + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses( - ) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. 
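As a side note on the argument handling shown just above: `HfArgumentParser` lets each script accept either regular CLI flags or a single path to a JSON file holding all arguments. A minimal standalone sketch of that dispatch, using illustrative dataclasses rather than the scripts' full argument sets:

```python
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DemoModelArguments:
    # Illustrative subset of the real ModelArguments.
    model_name_or_path: Optional[str] = field(default=None)


@dataclass
class DemoDataArguments:
    # Illustrative subset of the real DataTrainingArguments.
    dataset_name: Optional[str] = field(default="wikitext")
    dataset_config_name: Optional[str] = field(default="wikitext-2-raw-v1")


if __name__ == "__main__":
    parser = HfArgumentParser((DemoModelArguments, DemoDataArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # e.g. `python demo.py train_config.json`
        model_args, data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        # e.g. `python demo.py --model_name_or_path openai-community/gpt2`
        model_args, data_args = parser.parse_args_into_dataclasses()
    print(model_args, data_args)
```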
@@ -305,23 +277,20 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None - if os.path.isdir( - training_args.output_dir - ) and training_args.do_train and not training_args.overwrite_output_dir: + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir( - training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") + "Use --overwrite_output_dir to overcome." + ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " @@ -346,9 +315,9 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, - trust_remote_code=True, + trust_remote_code=model_args.trust_remote_code, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( @@ -356,18 +325,18 @@ def main(): data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, - trust_remote_code=True, + trust_remote_code=model_args.trust_remote_code, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, streaming=data_args.streaming, - trust_remote_code=True, + trust_remote_code=model_args.trust_remote_code, ) else: data_files = {} @@ -376,9 +345,11 @@ def main(): data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file - extension = (data_args.train_file.split(".")[-1] - if data_args.train_file is not None else - data_args.validation_file.split(".")[-1]) + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks @@ -386,8 +357,7 @@ def main(): extension, data_files=data_files, cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=True, + token=model_args.token, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
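The splitting logic above relies on the `datasets` split-slicing syntax: when the loaded dataset has no `validation` split, the first `validation_split_percentage` percent of `train` becomes the validation set and the remainder stays as training data. A small standalone sketch of that mechanism (WikiText-2 actually ships a validation split, so it is used here only to demonstrate the slicing):

```python
from datasets import load_dataset

validation_split_percentage = 5  # the scripts' default

# First 5% of the train split, held out as a validation set.
raw_validation = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split=f"train[:{validation_split_percentage}%]"
)
# Remaining 95% of the train split, used for training.
raw_train = load_dataset(
    "wikitext", "wikitext-2-raw-v1", split=f"train[{validation_split_percentage}%:]"
)
print(len(raw_validation), len(raw_train))
```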
@@ -397,8 +367,7 @@ def main(): data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=True, + token=model_args.token, **dataset_args, ) raw_datasets["train"] = load_dataset( @@ -406,13 +375,12 @@ def main(): data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=True, + token=model_args.token, **dataset_args, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. + # https://huggingface.co/docs/datasets/loading_datasets. # Load pretrained model and tokenizer # @@ -423,18 +391,16 @@ def main(): config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, - **config_kwargs) + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, - **config_kwargs) + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = CONFIG_MAPPING[model_args.model_type]() - logger.warning( - "You are instantiating a new config instance from scratch.") + logger.warning("You are instantiating a new config instance from scratch.") if model_args.config_overrides is not None: logger.info(f"Overriding config: {model_args.config_overrides}") config.update_from_string(model_args.config_overrides) @@ -444,41 +410,41 @@ def main(): "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, } if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, - **tokenizer_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, **tokenizer_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." 
) if model_args.model_name_or_path: - torch_dtype = (model_args.torch_dtype if model_args.torch_dtype in [ - "auto", None - ] else getattr(torch, model_args.torch_dtype)) + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) + ) model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, low_cpu_mem_usage=model_args.low_cpu_mem_usage, ) else: - model = AutoModelForCausalLM.from_config(config) - n_params = sum({p.data_ptr(): p.numel() - for p in model.parameters()}.values()) - logger.info( - f"Training new model from scratch - Total size={n_params/2**20:.2f}M params" - ) + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + # Log number of parameters num_params = get_num_parameters(model) mlflow.log_param("num_params", num_params) @@ -498,8 +464,7 @@ def main(): text_column_name = "text" if "text" in column_names else column_names[0] # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger( - "transformers.tokenization_utils_base") + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") def tokenize_function(examples): with CaptureLogger(tok_logger) as cl: @@ -508,7 +473,8 @@ def tokenize_function(examples): if "Token indices sequence length is longer than the" in cl.out: tok_logger.warning( "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" - " before being passed to the model.") + " before being passed to the model." + ) return output with training_args.main_process_first(desc="dataset map tokenization"): @@ -522,20 +488,32 @@ def tokenize_function(examples): desc="Running tokenizer on dataset", ) else: - tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names,) + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) + if hasattr(config, "max_position_embeddings"): + max_pos_embeddings = config.max_position_embeddings + else: + # Define a default value if the attribute is missing in the config. + max_pos_embeddings = 1024 if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > max_pos_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`.") - block_size = 1024 + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx." 
+ ) + if max_pos_embeddings > 0: + block_size = min(1024, max_pos_embeddings) + else: + block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) @@ -543,19 +521,14 @@ def tokenize_function(examples): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = { - k: list(chain(*examples[k])) - for k in examples.keys() - } + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { - k: - [t[i:i + block_size] for i in range(0, total_length, block_size)] + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() @@ -566,7 +539,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with training_args.main_process_first(desc="grouping texts together"): if not data_args.streaming: @@ -578,15 +551,17 @@ def group_texts(examples): desc=f"Grouping texts in chunks of {block_size}", ) else: - lm_datasets = tokenized_datasets.map(group_texts, batched=True,) + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) if training_args.do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = lm_datasets["train"] if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), - data_args.max_train_samples) + max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if training_args.do_eval: @@ -594,8 +569,7 @@ def group_texts(examples): raise ValueError("--do_eval requires a validation dataset") eval_dataset = lm_datasets["validation"] if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), - data_args.max_eval_samples) + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) def preprocess_logits_for_metrics(logits, labels): @@ -605,7 +579,7 @@ def preprocess_logits_for_metrics(logits, labels): logits = logits[0] return logits.argmax(dim=-1) - metric = evaluate.load("accuracy") + metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) def compute_metrics(eval_preds): preds, labels = eval_preds @@ -624,10 +598,10 @@ def compute_metrics(eval_preds): tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. 
data_collator=default_data_collator, - compute_metrics=compute_metrics - if training_args.do_eval and not is_torch_tpu_available() else None, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, preprocess_logits_for_metrics=preprocess_logits_for_metrics - if training_args.do_eval and not is_torch_tpu_available() else None, + if training_args.do_eval and not is_torch_xla_available() + else None, ) trainer.add_callback(TBTrainerCallback) @@ -643,9 +617,9 @@ def compute_metrics(eval_preds): metrics = train_result.metrics - max_train_samples = (data_args.max_train_samples - if data_args.max_train_samples is not None else - len(train_dataset)) + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) @@ -658,8 +632,7 @@ def compute_metrics(eval_preds): metrics = trainer.evaluate() - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( - eval_dataset) + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) try: perplexity = math.exp(metrics["eval_loss"]) @@ -670,18 +643,15 @@ def compute_metrics(eval_preds): trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) - kwargs = { - "finetuned_from": model_args.model_name_or_path, - "tasks": "text-generation" - } + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name - kwargs[ - "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name + mlflow.end_run() if training_args.push_to_hub: trainer.push_to_hub(**kwargs) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index c13ea9f688d701..a10353abdb8044 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -37,7 +37,7 @@ from accelerate.logging import get_logger from accelerate.utils import set_seed from datasets import load_dataset -from huggingface_hub import Repository, create_repo +from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm @@ -52,16 +52,16 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry +from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.29.0") +check_min_version("4.44.0") logger = get_logger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @@ -82,10 +82,10 @@ def parse_args(): help="The configuration name of the dataset to use (via the datasets library).", ) parser.add_argument( - "--train_file", type=str, default=None, help="A csv or a json file containing the training data." + "--train_file", type=str, default=None, help="A csv, txt or a json file containing the training data." ) parser.add_argument( - "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." + "--validation_file", type=str, default=None, help="A csv, txt or a json file containing the validation data." ) parser.add_argument( "--validation_split_percentage", @@ -193,6 +193,15 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) parser.add_argument( "--checkpointing_steps", type=str, @@ -216,7 +225,7 @@ def parse_args(): default="all", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations. ' "Only applicable when `--with_tracking` is passed." ), ) @@ -224,7 +233,7 @@ def parse_args(): "--low_cpu_mem_usage", action="store_true", help=( - "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " "If passed, LLM loading time and RAM consumption will be benefited." ), ) @@ -236,13 +245,16 @@ def parse_args(): else: if args.train_file is not None: extension = args.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." + if extension not in ["csv", "json", "txt"]: + raise ValueError("`train_file` should be a csv, json or txt file.") if args.validation_file is not None: extension = args.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + if extension not in ["csv", "json", "txt"]: + raise ValueError("`validation_file` should be a csv, json or txt file.") if args.push_to_hub: - assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." 
+ if args.output_dir is None: + raise ValueError("Need an `output_dir` to create a repo when `--push_to_hub` is passed.") return args @@ -261,7 +273,7 @@ def main(): if args.with_tracking: accelerator_log_kwargs["log_with"] = args.report_to - accelerator_log_kwargs["logging_dir"] = args.output_dir + accelerator_log_kwargs["project_dir"] = args.output_dir accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) @@ -286,12 +298,13 @@ def main(): # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) + # Retrieve of infer repo_name + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + # Create repo and retrieve repo_id + api = HfApi() + repo_id = api.create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: @@ -313,26 +326,31 @@ def main(): # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + raw_datasets = load_dataset( + args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code + ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[:{args.validation_split_percentage}%]", + trust_remote_code=args.trust_remote_code, ) raw_datasets["train"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[{args.validation_split_percentage}%:]", + trust_remote_code=args.trust_remote_code, ) else: data_files = {} dataset_args = {} if args.train_file is not None: data_files["train"] = args.train_file + extension = args.train_file.split(".")[-1] if args.validation_file is not None: data_files["validation"] = args.validation_file - extension = args.train_file.split(".")[-1] + extension = args.validation_file.split(".")[-1] if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks @@ -353,27 +371,37 @@ def main(): ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. + # https://huggingface.co/docs/datasets/loading_datasets. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if args.config_name: - config = AutoConfig.from_pretrained(args.config_name) + config = AutoConfig.from_pretrained( + args.config_name, + trust_remote_code=args.trust_remote_code, + ) elif args.model_name_or_path: - config = AutoConfig.from_pretrained(args.model_name_or_path) + config = AutoConfig.from_pretrained( + args.model_name_or_path, + trust_remote_code=args.trust_remote_code, + ) else: config = CONFIG_MAPPING[args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code + ) elif args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code + ) else: raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) @@ -383,10 +411,11 @@ def main(): from_tf=bool(".ckpt" in args.model_name_or_path), config=config, low_cpu_mem_usage=args.low_cpu_mem_usage, + trust_remote_code=args.trust_remote_code, ) else: logger.info("Training new model from scratch") - model = AutoModelForCausalLM.from_config(config) + model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -414,17 +443,16 @@ def tokenize_function(examples): if args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" + f"The block_size passed ({args.block_size}) is larger than the maximum length for the model " f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(args.block_size, tokenizer.model_max_length) @@ -434,10 +462,9 @@ def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. 
- if total_length >= block_size: - total_length = (total_length // block_size) * block_size + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] @@ -451,7 +478,7 @@ def group_texts(examples): # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + # https://huggingface.co/docs/datasets/process#map with accelerator.main_process_first(): lm_datasets = tokenized_datasets.map( @@ -502,8 +529,10 @@ def group_texts(examples): lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes, ) # Prepare everything with our `accelerator`. @@ -553,43 +582,45 @@ def group_texts(examples): # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": - accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") - accelerator.load_state(args.resume_from_checkpoint) + checkpoint_path = args.resume_from_checkpoint path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(checkpoint_path) # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) # update the progress_bar if load from checkpoint - progress_bar.update(starting_epoch * num_update_steps_per_epoch) - completed_steps = starting_epoch * num_update_steps_per_epoch + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - 
completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss @@ -608,7 +639,7 @@ def group_texts(examples): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps }" + output_dir = f"step_{completed_steps}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) @@ -653,8 +684,12 @@ def group_texts(examples): ) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) - repo.push_to_hub( - commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True + api.upload_folder( + commit_message=f"Training in progress epoch {epoch}", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, ) if args.checkpointing_steps == "epoch": @@ -675,8 +710,13 @@ def group_texts(examples): if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) - + api.upload_folder( + commit_message="End of training", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ) with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump({"perplexity": perplexity}, f)
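For reference, the push-to-hub flow that replaces the old `Repository` workflow in the `no_trainer` script boils down to two `huggingface_hub` calls: create the repository once up front, then upload the output folder whenever a copy should be published. A condensed standalone sketch, with the path and token as placeholders:

```python
from pathlib import Path

from huggingface_hub import HfApi

output_dir = "/tmp/test-clm"  # placeholder: the script's --output_dir
hub_token = None  # placeholder: falls back to the token from `huggingface-cli login`

api = HfApi()
# Create (or reuse) the repo and keep its fully-qualified id, e.g. "username/test-clm".
repo_id = api.create_repo(Path(output_dir).absolute().name, exist_ok=True, token=hub_token).repo_id

# Upload the saved model, tokenizer and training artifacts in one commit.
api.upload_folder(
    commit_message="End of training",
    folder_path=output_dir,
    repo_id=repo_id,
    repo_type="model",
    token=hub_token,
)
```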