diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000..942311cffc --- /dev/null +++ b/docs/README.md @@ -0,0 +1,58 @@ +# Generating the documentation + +To generate the documentation, you first need to build it. Several packages are required to build the docs. + +First, you need to install the project itself by running the following command at the root of the code repository: + +```bash +pip install -e . +``` + +You also need to install two extra packages: + +```bash +# `hf-doc-builder` to build the docs +pip install git+https://github.com/huggingface/doc-builder@main +# `watchdog` for live reloads +pip install watchdog +``` + +--- +**NOTE** + +You only need to generate the documentation to inspect it locally (for instance, if you're planning changes and want to +check how they look before committing). You don't have to commit the built documentation. + +--- + +## Building the documentation + +Once you have set up `doc-builder` and the additional packages with the pip install commands above, +you can generate the documentation by running the following command: + +```bash +doc-builder build autotrain docs/source/ --build_dir ~/tmp/test-build +``` + +You can set `--build_dir` to any temporary folder that you prefer. This command will create it and generate +the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite +Markdown editor. + +## Previewing the documentation + +To preview the docs, run the following command: + +```bash +doc-builder preview autotrain docs/source/ +``` + +The docs will be viewable at [http://localhost:5173](http://localhost:5173). You can also preview the docs once you +have opened a PR. A bot will add a comment with a link to the documentation that includes your changes. + +--- +**NOTE** + +The `preview` command only works with existing doc files. When you add a completely new file, you need to update +`_toctree.yml` and restart the `preview` command (`ctrl-c` to stop it and call `doc-builder preview ...` again).
+ +--- \ No newline at end of file diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index be055920a1..aee205c5dd 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -25,50 +25,50 @@ title: AutoTrain API title: Miscellaneous - sections: - - local: text_classification + - local: tasks/text_classification title: Text Classification - - local: extractive_qa + - local: tasks/extractive_qa title: Extractive QA - - local: sentence_transformer + - local: tasks/sentence_transformer title: Sentence Transformer - - local: text_regression + - local: tasks/text_regression title: Text Regression - - local: llm_finetuning + - local: tasks/llm_finetuning title: LLM Finetuning - - local: image_classification + - local: tasks/image_classification title: Image Classification - - local: image_regression + - local: tasks/image_regression title: Image Scoring/Regression - - local: object_detection + - local: tasks/object_detection title: Object Detection - - local: dreambooth + - local: tasks/dreambooth title: DreamBooth - - local: seq2seq + - local: tasks/seq2seq title: Seq2Seq - - local: token_classification + - local: tasks/token_classification title: Token Classification - - local: tabular + - local: tasks/tabular title: Tabular title: Data Formats - sections: - - local: text_classification_params + - local: params/text_classification_params title: Text Classification & Regression - - local: extractive_qa_params + - local: params/extractive_qa_params title: Extractive QA - - local: llm_finetuning_params + - local: params/llm_finetuning_params title: LLM Finetuning - - local: image_classification_params + - local: params/image_classification_params title: Image Classification - - local: image_regression_params + - local: params/image_regression_params title: Image Scoring/Regression - - local: object_detection_params + - local: params/object_detection_params title: Object Detection - - local: dreambooth_params + - local: params/dreambooth_params title: DreamBooth - - local: seq2seq_params + - local: params/seq2seq_params title: Seq2Seq - - local: token_classification_params + - local: params/token_classification_params title: Token Classification - - local: tabular_params + - local: params/tabular_params title: Tabular title: Parameters \ No newline at end of file diff --git a/docs/source/dreambooth_params.mdx b/docs/source/dreambooth_params.mdx deleted file mode 100644 index 7dc3fd9cd8..0000000000 --- a/docs/source/dreambooth_params.mdx +++ /dev/null @@ -1,81 +0,0 @@ -## DreamBooth Parameters - -``` - --batch-size BATCH_SIZE - Training batch size to use - --seed SEED Random seed for reproducibility - --epochs EPOCHS Number of training epochs - --gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps - --disable_gradient_checkpointing - Disable gradient checkpointing - --lr LR Learning rate - --tokenizer TOKENIZER - Tokenizer to use for training - --class-image-path CLASS_IMAGE_PATH - Path to the class images - --prompt PROMPT Instance prompt - --prior-preservation With prior preservation - --prior-loss-weight PRIOR_LOSS_WEIGHT - Prior loss weight - --resolution RESOLUTION - Resolution - --center-crop Center crop - --train-text-encoder Train text encoder - --sample-batch-size SAMPLE_BATCH_SIZE - Sample batch size - --num-steps NUM_STEPS - Max train steps - --checkpointing-steps CHECKPOINTING_STEPS - Checkpointing steps - --resume-from-checkpoint RESUME_FROM_CHECKPOINT - Resume from checkpoint - --scale-lr Scale learning rate - --scheduler SCHEDULER - 
Learning rate scheduler - --warmup-steps WARMUP_STEPS - Learning rate warmup steps - --num-cycles NUM_CYCLES - Learning rate num cycles - --lr-power LR_POWER Learning rate power - --dataloader-num-workers DATALOADER_NUM_WORKERS - Dataloader num workers - --use-8bit-adam Use 8bit adam - --adam-beta1 ADAM_BETA1 - Adam beta 1 - --adam-beta2 ADAM_BETA2 - Adam beta 2 - --adam-weight-decay ADAM_WEIGHT_DECAY - Adam weight decay - --adam-epsilon ADAM_EPSILON - Adam epsilon - --max-grad-norm MAX_GRAD_NORM - Max grad norm - --allow-tf32 Allow TF32 - --prior-generation-precision PRIOR_GENERATION_PRECISION - Prior generation precision - --local-rank LOCAL_RANK - Local rank - --xformers Enable xformers memory efficient attention - --pre-compute-text-embeddings - Pre compute text embeddings - --tokenizer-max-length TOKENIZER_MAX_LENGTH - Tokenizer max length - --text-encoder-use-attention-mask - Text encoder use attention mask - --rank RANK Rank - --xl XL - --mixed-precision MIXED_PRECISION - mixed precision, fp16, bf16, none - --validation-prompt VALIDATION_PROMPT - Validation prompt - --num-validation-images NUM_VALIDATION_IMAGES - Number of validation images - --validation-epochs VALIDATION_EPOCHS - Validation epochs - --checkpoints-total-limit CHECKPOINTS_TOTAL_LIMIT - Checkpoints total limit - --validation-images VALIDATION_IMAGES - Validation images - --logging Logging using tensorboard -``` \ No newline at end of file diff --git a/docs/source/extractive_qa_params.mdx b/docs/source/extractive_qa_params.mdx deleted file mode 100644 index 1952ab7754..0000000000 --- a/docs/source/extractive_qa_params.mdx +++ /dev/null @@ -1,57 +0,0 @@ -# Extractive Question Answering Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. ---target-column TARGET_COLUMN - Specify the column name that holds the target or label data for training. Helps in distinguishing different potential - outputs. Default is 'target'. ---max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. ---max-doc-stride MAX_DOC_STRIDE - Set the maximum stride length (number of tokens) that the model should use to slide the window over the document. Default - is 128 tokens. ---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. 
---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. ---eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file diff --git a/docs/source/image_classification_params.mdx b/docs/source/image_classification_params.mdx deleted file mode 100644 index ce43c1ff3c..0000000000 --- a/docs/source/image_classification_params.mdx +++ /dev/null @@ -1,49 +0,0 @@ -# Image Classification Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---image-column IMAGE_COLUMN - Image column to use ---target-column TARGET_COLUMN - Target column to use ---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. 
---eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file diff --git a/docs/source/object_detection_params.mdx b/docs/source/object_detection_params.mdx deleted file mode 100644 index 66c7617132..0000000000 --- a/docs/source/object_detection_params.mdx +++ /dev/null @@ -1,52 +0,0 @@ -# Object Detection Parameters - -``` ---image-square-size IMAGE_SQUARE_SIZE - Resize the input images to a square shape with the specified size. This is useful for models that require square input images. - Default is 600. ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---image-column IMAGE_COLUMN - Image column to use ---target-column TARGET_COLUMN - Target column to use ---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. ---eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. 
When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file diff --git a/docs/source/params/dreambooth_params.mdx b/docs/source/params/dreambooth_params.mdx new file mode 100644 index 0000000000..0c0056d2b4 --- /dev/null +++ b/docs/source/params/dreambooth_params.mdx @@ -0,0 +1,3 @@ +# DreamBooth Parameters + +[[autodoc]] trainers.dreambooth.params.DreamBoothTrainingParams \ No newline at end of file diff --git a/docs/source/params/extractive_qa_params.mdx b/docs/source/params/extractive_qa_params.mdx new file mode 100644 index 0000000000..de8e18871a --- /dev/null +++ b/docs/source/params/extractive_qa_params.mdx @@ -0,0 +1,3 @@ +# Extractive Question Answering Parameters + +[[autodoc]] trainers.extractive_question_answering.params.ExtractiveQuestionAnsweringParams \ No newline at end of file diff --git a/docs/source/params/image_classification_params.mdx b/docs/source/params/image_classification_params.mdx new file mode 100644 index 0000000000..55eebd8914 --- /dev/null +++ b/docs/source/params/image_classification_params.mdx @@ -0,0 +1,3 @@ +# Image Classification Parameters + +[[autodoc]] trainers.image_classification.params.ImageClassificationParams \ No newline at end of file diff --git a/docs/source/image_regression_params.mdx b/docs/source/params/image_regression_params.mdx similarity index 66% rename from docs/source/image_regression_params.mdx rename to docs/source/params/image_regression_params.mdx index 8434a0ab9d..8a94f41e68 100644 --- a/docs/source/image_regression_params.mdx +++ b/docs/source/params/image_regression_params.mdx @@ -1,3 +1,5 @@ # Image Scoring/Regression Parameters The Parameters for image scoring/regression are same as the parameters for image classification. + +[[autodoc]] trainers.image_regression.params.ImageRegressionParams \ No newline at end of file diff --git a/docs/source/llm_finetuning_params.mdx b/docs/source/params/llm_finetuning_params.mdx similarity index 52% rename from docs/source/llm_finetuning_params.mdx rename to docs/source/params/llm_finetuning_params.mdx index ce0e89653b..db67624095 100644 --- a/docs/source/llm_finetuning_params.mdx +++ b/docs/source/params/llm_finetuning_params.mdx @@ -1,5 +1,7 @@ # LLM Fine Tuning Parameters +[[autodoc]] trainers.clm.params.LLMTrainingParams + ## Task specific parameters @@ -91,72 +93,3 @@ The length parameters used for different trainers can be different. Some require --max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH Completion length to use, for orpo: encoder-decoder models only ``` - - -## Common parameters -``` ---batch-size BATCH_SIZE, --train-batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION, --gradient-accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing, --disable-gradient-checkpointing, --disable-gc - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---warmup_ratio WARMUP_RATIO, --warmup-ratio WARMUP_RATIO - Set the proportion of training allocated to warming up the learning rate, which can enhance model stability and performance - at the start of training. 
Default is 0.1 ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight_decay WEIGHT_DECAY, --weight-decay WEIGHT_DECAY - Define the weight decay rate for regularization, which helps prevent overfitting by penalizing larger weights. Default is - 0.0 ---max_grad_norm MAX_GRAD_NORM, --max-grad-norm MAX_GRAD_NORM - Set the maximum norm for gradient clipping, which is critical for preventing gradients from exploding during - backpropagation. Default is 1.0. ---peft, --use-peft Enable LoRA-PEFT ---lora_r LORA_R, --lora-r LORA_R - Set the 'r' parameter for Low-Rank Adaptation (LoRA). Default is 16. ---lora_alpha LORA_ALPHA, --lora-alpha LORA_ALPHA - Specify the 'alpha' parameter for LoRA. Default is 32. ---lora_dropout LORA_DROPOUT, --lora-dropout LORA_DROPOUT - Set the dropout rate within the LoRA layers to help prevent overfitting during adaptation. Default is 0.05. ---logging_steps LOGGING_STEPS, --logging-steps LOGGING_STEPS - Determine how often to log training progress in terms of steps. Setting it to '-1' determines logging steps automatically. ---eval_strategy {epoch,steps,no}, --eval-strategy {epoch,steps,no} - Choose how frequently to evaluate the model's performance, with 'epoch' as the default, meaning at the end of each training - epoch ---save_total_limit SAVE_TOTAL_LIMIT, --save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of saved model checkpoints to manage disk usage effectively. Default is to save only the latest - checkpoint ---auto_find_batch_size, --auto-find-batch-size - Automatically determine the optimal batch size based on system capabilities to maximize efficiency. ---mixed_precision {fp16,bf16,None}, --mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. ---quantization {int4,int8,None}, --quantization {int4,int8,None} - Choose the quantization level to reduce model size and potentially increase inference speed. Options include 'int4', 'int8', - or None. Enabling requires --peft ---trainer {default,dpo,sft,orpo,reward} - Trainer type to use ---target_modules TARGET_MODULES, --target-modules TARGET_MODULES - Identify specific modules within the model architecture to target with adaptations or optimizations, such as LoRA. Comma - separated list of module names. Default is 'all-linear'. ---merge_adapter, --merge-adapter - Use this flag to merge PEFT adapter with the model ---use_flash_attention_2, --use-flash-attention-2, --use-fa2 - Use flash attention 2 ---chat_template {tokenizer,chatml,zephyr,None}, --chat-template {tokenizer,chatml,zephyr,None} - Apply a specific template for chat-based interactions, with options including 'tokenizer', 'chatml', 'zephyr', or None. This - setting can shape the model's conversational behavior. ---padding {left,right,None}, --padding {left,right,None} - Specify the padding direction for sequences, critical for models sensitive to input alignment. 
Options include 'left', - 'right', or None -``` \ No newline at end of file diff --git a/docs/source/params/object_detection_params.mdx b/docs/source/params/object_detection_params.mdx new file mode 100644 index 0000000000..59cee7acad --- /dev/null +++ b/docs/source/params/object_detection_params.mdx @@ -0,0 +1,3 @@ +# Object Detection Parameters + +[[autodoc]] trainers.object_detection.params.ObjectDetectionParams diff --git a/docs/source/params/seq2seq_params.mdx b/docs/source/params/seq2seq_params.mdx new file mode 100644 index 0000000000..82754114bf --- /dev/null +++ b/docs/source/params/seq2seq_params.mdx @@ -0,0 +1,3 @@ +# Seq2Seq Parameters + +[[autodoc]] trainers.seq2seq.params.Seq2SeqParams diff --git a/docs/source/params/tabular_params.mdx b/docs/source/params/tabular_params.mdx new file mode 100644 index 0000000000..c99608bd58 --- /dev/null +++ b/docs/source/params/tabular_params.mdx @@ -0,0 +1,3 @@ +# Tabular Parameters + +[[autodoc]] trainers.tabular.params.TabularParams diff --git a/docs/source/params/text_classification_params.mdx b/docs/source/params/text_classification_params.mdx new file mode 100644 index 0000000000..5c07792369 --- /dev/null +++ b/docs/source/params/text_classification_params.mdx @@ -0,0 +1,3 @@ +# Text Classification & Regression Parameters + +[[autodoc]] trainers.text_classification.params.TextClassificationParams diff --git a/docs/source/params/token_classification_params.mdx b/docs/source/params/token_classification_params.mdx new file mode 100644 index 0000000000..00fc66b18b --- /dev/null +++ b/docs/source/params/token_classification_params.mdx @@ -0,0 +1,3 @@ +# Token Classification Parameters + +[[autodoc]] trainers.token_classification.params.TokenClassificationParams diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 738078c184..51028cc54c 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -76,7 +76,7 @@ options: For more information about a command, run: `autotrain --help` ``` -It is advised to use `autotrain --config CONFIG_FILE` command when using the CLI. +It is advised to use only the `autotrain --config CONFIG_FILE` command for training when using the CLI. The autotrain commands that end users will be interested in are: diff --git a/docs/source/seq2seq_params.mdx b/docs/source/seq2seq_params.mdx deleted file mode 100644 index 68873ca24e..0000000000 --- a/docs/source/seq2seq_params.mdx +++ /dev/null @@ -1,70 +0,0 @@ -# Seq2Seq Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. ---target-column TARGET_COLUMN - Specify the column name that holds the target data for training. Helps in distinguishing different potential outputs. - Default is 'target'. ---max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. 
---max-target-length MAX_TARGET_LENGTH - Define the maximum number of tokens for the target sequence in each input. Useful for models that generate outputs, ensuring - uniformity in sequence length. Default is set to 128 tokens. ---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. ---eval-strategy eval_strategy - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. ---peft Enable LoRA-PEFT ---quantization {int8,None} - Select the quantization mode to reduce model size and potentially increase inference speed. Options include 'int8' for 8-bit - integer quantization or None for no quantization. Default is None ---lora-r LORA_R Set the rank 'R' for the LoRA (Low-Rank Adaptation) technique. Default is 16. ---lora-alpha LORA_ALPHA - Specify the 'Alpha' parameter for LoRA. Default is 32. ---lora-dropout LORA_DROPOUT - Determine the dropout rate to apply in the LoRA layers, which can help in preventing overfitting by randomly disabling a - fraction of neurons during training. Default rate is 0.05. ---target-modules TARGET_MODULES - List the modules within the model architecture that should be targeted for specific techniques such as LoRA adaptations. - Useful for fine-tuning particular components of large models. By default all linear layers are targeted. 
-``` diff --git a/docs/source/tabular_params.mdx b/docs/source/tabular_params.mdx deleted file mode 100644 index a2dd7fd1b6..0000000000 --- a/docs/source/tabular_params.mdx +++ /dev/null @@ -1,38 +0,0 @@ -# Tabular Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---target-columns TARGET_COLUMNS - Specify the names of the target or label columns separated by commas if multiple. These columns are what the model will - predict. Required for defining the output of the model. ---categorical-columns CATEGORICAL_COLUMNS - List the names of columns that contain categorical data, useful for models that need explicit handling of such data. - Categorical data is typically processed differently from numerical data, such as through encoding. If not specified, the - model will infer the data type. ---numerical-columns NUMERICAL_COLUMNS - Identify columns that contain numerical data. Proper specification helps in applying appropriate scaling and normalization - techniques, which can significantly impact model performance. If not specified, the model will infer the data type. ---id-column ID_COLUMN - Specify the column name that uniquely identifies each row in the dataset. This is critical for tracking samples through the - model pipeline and is often excluded from model training. Required field. ---task {classification,regression} - Define the type of machine learning task, such as 'classification', 'regression'. This parameter determines the model's - architecture and the loss function to use. Required to properly configure the model. ---num-trials NUM_TRIALS - Set the number of trials for hyperparameter tuning or model experimentation. More trials can lead to better model - configurations but require more computational resources. Default is 100 trials. ---time-limit TIME_LIMIT - mpose a time limit (in seconds) for training or searching for the best model configuration. This helps manage resource - allocation and ensures the process does not exceed available computational budgets. The default is 3600 seconds (1 hour). ---categorical-imputer {most_frequent,None} - Select the method or strategy to impute missing values in categorical columns. Options might include 'most_frequent', - 'None'. Correct imputation can prevent biases and improve model accuracy. ---numerical-imputer {mean,median,None} - Choose the imputation strategy for missing values in numerical columns. Common strategies include 'mean', & 'median'. - Accurate imputation is vital for maintaining the integrity of numerical data. ---numeric-scaler {standard,minmax,normal,robust} - Determine the type of scaling to apply to numerical data. Examples include 'standard' (zero mean and unit variance), 'min- - max' (scaled between given range), etc. 
Scaling is essential for many algorithms to perform optimally -``` diff --git a/docs/source/dreambooth.mdx b/docs/source/tasks/dreambooth.mdx similarity index 100% rename from docs/source/dreambooth.mdx rename to docs/source/tasks/dreambooth.mdx diff --git a/docs/source/extractive_qa.mdx b/docs/source/tasks/extractive_qa.mdx similarity index 100% rename from docs/source/extractive_qa.mdx rename to docs/source/tasks/extractive_qa.mdx diff --git a/docs/source/image_classification.mdx b/docs/source/tasks/image_classification.mdx similarity index 100% rename from docs/source/image_classification.mdx rename to docs/source/tasks/image_classification.mdx diff --git a/docs/source/image_regression.mdx b/docs/source/tasks/image_regression.mdx similarity index 100% rename from docs/source/image_regression.mdx rename to docs/source/tasks/image_regression.mdx diff --git a/docs/source/llm_finetuning.mdx b/docs/source/tasks/llm_finetuning.mdx similarity index 100% rename from docs/source/llm_finetuning.mdx rename to docs/source/tasks/llm_finetuning.mdx diff --git a/docs/source/object_detection.mdx b/docs/source/tasks/object_detection.mdx similarity index 100% rename from docs/source/object_detection.mdx rename to docs/source/tasks/object_detection.mdx diff --git a/docs/source/sentence_transformer.mdx b/docs/source/tasks/sentence_transformer.mdx similarity index 100% rename from docs/source/sentence_transformer.mdx rename to docs/source/tasks/sentence_transformer.mdx diff --git a/docs/source/seq2seq.mdx b/docs/source/tasks/seq2seq.mdx similarity index 100% rename from docs/source/seq2seq.mdx rename to docs/source/tasks/seq2seq.mdx diff --git a/docs/source/tabular.mdx b/docs/source/tasks/tabular.mdx similarity index 100% rename from docs/source/tabular.mdx rename to docs/source/tasks/tabular.mdx diff --git a/docs/source/text_classification.mdx b/docs/source/tasks/text_classification.mdx similarity index 100% rename from docs/source/text_classification.mdx rename to docs/source/tasks/text_classification.mdx diff --git a/docs/source/text_regression.mdx b/docs/source/tasks/text_regression.mdx similarity index 100% rename from docs/source/text_regression.mdx rename to docs/source/tasks/text_regression.mdx diff --git a/docs/source/token_classification.mdx b/docs/source/tasks/token_classification.mdx similarity index 100% rename from docs/source/token_classification.mdx rename to docs/source/tasks/token_classification.mdx diff --git a/docs/source/text_classification_params.mdx b/docs/source/text_classification_params.mdx deleted file mode 100644 index c98c219a45..0000000000 --- a/docs/source/text_classification_params.mdx +++ /dev/null @@ -1,54 +0,0 @@ -# Text Classification & Regression Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---text-column TEXT_COLUMN - Specify the column name in the dataset that contains the text data. Useful for distinguishing between multiple text fields. - Default is 'text'. ---target-column TARGET_COLUMN - Specify the column name that holds the target or label data for training. Helps in distinguishing different potential - outputs. Default is 'target'. 
---max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. ---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. ---eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` \ No newline at end of file diff --git a/docs/source/token_classification_params.mdx b/docs/source/token_classification_params.mdx deleted file mode 100644 index 010e16ab37..0000000000 --- a/docs/source/token_classification_params.mdx +++ /dev/null @@ -1,52 +0,0 @@ -# Token Classification Parameters - -``` ---batch-size BATCH_SIZE - Training batch size to use ---seed SEED Random seed for reproducibility ---epochs EPOCHS Number of training epochs ---gradient_accumulation GRADIENT_ACCUMULATION - Gradient accumulation steps ---disable_gradient_checkpointing - Disable gradient checkpointing ---lr LR Learning rate ---log {none,wandb,tensorboard} - Use experiment tracking ---tokens-column TOKENS_COLUMN - Tokens column to use. Must be a stringified list of tokens if using a CSV file. Default is 'tokens'. ---tags-column TAGS_COLUMN - Tags column to use. Must be a stringified list of tags if using a CSV file. Default is 'tags'. ---max-seq-length MAX_SEQ_LENGTH - Set the maximum sequence length (number of tokens) that the model should handle in a single input. Longer sequences are - truncated. Affects both memory usage and computational requirements. Default is 128 tokens. 
---warmup-ratio WARMUP_RATIO - Define the proportion of training to be dedicated to a linear warmup where learning rate gradually increases. This can help - in stabilizing the training process early on. Default ratio is 0.1. ---optimizer OPTIMIZER - Choose the optimizer algorithm for training the model. Different optimizers can affect the training speed and model - performance. 'adamw_torch' is used by default. ---scheduler SCHEDULER - Select the learning rate scheduler to adjust the learning rate based on the number of epochs. 'linear' decreases the - learning rate linearly from the initial lr set. Default is 'linear'. Try 'cosine' for a cosine annealing schedule. ---weight-decay WEIGHT_DECAY - Set the weight decay rate to apply for regularization. Helps in preventing the model from overfitting by penalizing large - weights. Default is 0.0, meaning no weight decay is applied. ---max-grad-norm MAX_GRAD_NORM - Specify the maximum norm of the gradients for gradient clipping. Gradient clipping is used to prevent the exploding gradient - problem in deep neural networks. Default is 1.0. ---logging-steps LOGGING_STEPS - Determine how often to log training progress. Set this to the number of steps between each log output. -1 determines logging - steps automatically. Default is -1. ---eval-strategy {steps,epoch,no} - Specify how often to evaluate the model performance. Options include 'no', 'steps', 'epoch'. 'epoch' evaluates at the end of - each training epoch by default. ---save-total-limit SAVE_TOTAL_LIMIT - Limit the total number of model checkpoints to save. Helps manage disk space by retaining only the most recent checkpoints. - Default is to save only the latest one. ---auto-find-batch-size - Enable automatic batch size determination based on your hardware capabilities. When set, it tries to find the largest batch - size that fits in memory. ---mixed-precision {fp16,bf16,None} - Choose the precision mode for training to optimize performance and memory usage. Options are 'fp16', 'bf16', or None for - default precision. Default is None. -``` diff --git a/src/autotrain/app/api_routes.py b/src/autotrain/app/api_routes.py index 7ccd8c14ab..cd6e04bdb5 100644 --- a/src/autotrain/app/api_routes.py +++ b/src/autotrain/app/api_routes.py @@ -28,6 +28,23 @@ def create_api_base_model(base_class, class_name): + """ + Creates a new Pydantic model based on a given base class and class name, + excluding specified fields. + + Args: + base_class (Type): The base Pydantic model class to extend. + class_name (str): The name of the new model class to create. + + Returns: + Type: A new Pydantic model class with the specified modifications. + + Notes: + - The function uses type hints from the base class to define the new model's fields. + - Certain fields are excluded from the new model based on the class name. + - The function supports different sets of hidden parameters for different class names. + - The new model's configuration is set to have no protected namespaces. + """ annotations = get_type_hints(base_class) if class_name in ("LLMSFTTrainingParamsAPI", "LLMRewardTrainingParamsAPI"): more_hidden_params = [ @@ -206,6 +223,32 @@ class ExtractiveQuestionAnsweringColumnMapping(BaseModel): class APICreateProjectModel(BaseModel): + """ + APICreateProjectModel is a Pydantic model that defines the schema for creating a project. + + Attributes: + project_name (str): The name of the project. + task (Literal): The type of task for the project. 
Supported tasks include various LLM tasks, + image classification, dreambooth, seq2seq, token classification, text classification, + text regression, tabular classification, tabular regression, image regression, VLM tasks, + and extractive question answering. + base_model (str): The base model to be used for the project. + hardware (Literal): The type of hardware to be used for the project. Supported hardware options + include various configurations of spaces and local. + params (Union): The training parameters for the project. The type of parameters depends on the + task selected. + username (str): The username of the person creating the project. + column_mapping (Optional[Union]): The column mapping for the project. The type of column mapping + depends on the task selected. + hub_dataset (str): The dataset to be used for the project. + train_split (str): The training split of the dataset. + valid_split (Optional[str]): The validation split of the dataset. + + Methods: + validate_column_mapping(cls, values): Validates the column mapping based on the task selected. + validate_params(cls, values): Validates the training parameters based on the task selected. + """ + project_name: str task: Literal[ "llm:sft", @@ -530,6 +573,18 @@ def validate_params(cls, values): def api_auth(request: Request): + """ + Authenticates the API request using a Bearer token. + + Args: + request (Request): The incoming HTTP request object. + + Returns: + str: The verified Bearer token if authentication is successful. + + Raises: + HTTPException: If the token is invalid, expired, or missing. + """ authorization = request.headers.get("Authorization") if authorization: schema, _, token = authorization.partition(" ") @@ -553,9 +608,24 @@ def api_auth(request: Request): @api_router.post("/create_project", response_class=JSONResponse) async def api_create_project(project: APICreateProjectModel, token: bool = Depends(api_auth)): """ - This function is used to create a new project - :param project: APICreateProjectModel - :return: JSONResponse + Asynchronously creates a new project based on the provided parameters. + + Args: + project (APICreateProjectModel): The model containing the project details and parameters. + token (bool, optional): The authentication token. Defaults to Depends(api_auth). + + Returns: + dict: A dictionary containing a success message, the job ID of the created project, and a success status. + + Raises: + HTTPException: If there is an error during project creation. + + Notes: + - The function determines the hardware type based on the project hardware attribute. + - It logs the provided parameters and column mapping. + - It sets the appropriate parameters based on the task type. + - It updates the parameters with the provided ones and creates an AppParams instance. + - The function then creates an AutoTrainProject instance and initiates the project creation process. """ provided_params = project.params.model_dump() if project.hardware == "local": @@ -609,8 +679,13 @@ async def api_create_project(project: APICreateProjectModel, token: bool = Depen @api_router.get("/version", response_class=JSONResponse) async def api_version(): """ - This function is used to get the version of the API - :return: JSONResponse + Returns the current version of the API. + + This asynchronous function retrieves the version of the API from the + __version__ variable and returns it in a dictionary. + + Returns: + dict: A dictionary containing the API version. 
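+
+    Example:
+        A minimal sketch of calling this endpoint over HTTP. The host, port, and
+        router prefix below are assumptions and may differ in your deployment:
+
+            import requests
+
+            response = requests.get("http://localhost:7860/api/version")
+            print(response.json())  # {"version": "<installed autotrain version>"}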
""" return {"version": __version__} @@ -618,9 +693,14 @@ async def api_version(): @api_router.get("/logs", response_class=JSONResponse) async def api_logs(job_id: str, token: bool = Depends(api_auth)): """ - This function is used to get the logs of a project - :param job_id: str - :return: JSONResponse + Fetch logs for a specific job. + + Args: + job_id (str): The ID of the job for which logs are to be fetched. + token (bool, optional): Authentication token, defaults to the result of api_auth dependency. + + Returns: + dict: A dictionary containing the logs, success status, and a message. """ # project = AutoTrainProject(job_id=job_id, token=token) # logs = project.get_logs() @@ -630,9 +710,22 @@ async def api_logs(job_id: str, token: bool = Depends(api_auth)): @api_router.get("/stop_training", response_class=JSONResponse) async def api_stop_training(job_id: str, token: bool = Depends(api_auth)): """ - This function is used to stop the training of a project - :param job_id: str - :return: JSONResponse + Stops the training job with the given job ID. + + This asynchronous function pauses the training job identified by the provided job ID. + It uses the Hugging Face API to pause the space associated with the job. + + Args: + job_id (str): The ID of the job to stop. + token (bool, optional): The authentication token, provided by dependency injection. + + Returns: + dict: A dictionary containing a message and a success flag. If the training job + was successfully stopped, the message indicates success and the success flag is True. + If there was an error, the message contains the error details and the success flag is False. + + Raises: + Exception: If there is an error while attempting to stop the training job. """ hf_api = HfApi(token=token) try: diff --git a/src/autotrain/app/app.py b/src/autotrain/app/app.py index 37e5785031..e6e155545f 100644 --- a/src/autotrain/app/app.py +++ b/src/autotrain/app/app.py @@ -26,6 +26,16 @@ @app.get("/") async def forward_to_ui(request: Request): + """ + Forwards the incoming request to the UI endpoint. + + Args: + request (Request): The incoming HTTP request. + + Returns: + RedirectResponse: A response object that redirects to the UI endpoint, + including any query parameters from the original request. + """ query_params = request.query_params url = "/ui/" if query_params: diff --git a/src/autotrain/app/db.py b/src/autotrain/app/db.py index cd8a4e5298..7933273537 100644 --- a/src/autotrain/app/db.py +++ b/src/autotrain/app/db.py @@ -2,6 +2,36 @@ class AutoTrainDB: + """ + A class to manage job records in a SQLite database. + + Attributes: + ----------- + db_path : str + The path to the SQLite database file. + conn : sqlite3.Connection + The SQLite database connection object. + c : sqlite3.Cursor + The SQLite database cursor object. + + Methods: + -------- + __init__(db_path): + Initializes the database connection and creates the jobs table if it does not exist. + + create_jobs_table(): + Creates the jobs table in the database if it does not exist. + + add_job(pid): + Adds a new job with the given process ID (pid) to the jobs table. + + get_running_jobs(): + Retrieves a list of all running job process IDs (pids) from the jobs table. + + delete_job(pid): + Deletes the job with the given process ID (pid) from the jobs table. 
+ """ + def __init__(self, db_path): self.db_path = db_path self.conn = sqlite3.connect(db_path) diff --git a/src/autotrain/app/models.py b/src/autotrain/app/models.py index 717656eae9..4f42b0f5ec 100644 --- a/src/autotrain/app/models.py +++ b/src/autotrain/app/models.py @@ -4,6 +4,15 @@ def get_sorted_models(hub_models): + """ + Filters and sorts a list of models based on their download count. + + Args: + hub_models (list): A list of model objects. Each model object must have the attributes 'id', 'downloads', and 'private'. + + Returns: + list: A list of model IDs sorted by their download count in descending order. Only includes models that are not private. + """ hub_models = [{"id": m.id, "downloads": m.downloads} for m in hub_models if m.private is False] hub_models = sorted(hub_models, key=lambda x: x["downloads"], reverse=True) hub_models = [m["id"] for m in hub_models] @@ -11,6 +20,18 @@ def get_sorted_models(hub_models): def _fetch_text_classification_models(): + """ + Fetches and sorts text classification models from the Hugging Face model hub. + + This function retrieves models for the tasks "fill-mask" and "text-classification" + from the Hugging Face model hub, sorts them by the number of downloads, and combines + them into a single list. Additionally, it fetches trending models based on the number + of likes in the past 7 days, sorts them, and places them at the beginning of the list + if they are not already included. + + Returns: + list: A sorted list of model identifiers from the Hugging Face model hub. + """ hub_models1 = list( list_models( task="fill-mask", diff --git a/src/autotrain/app/oauth.py b/src/autotrain/app/oauth.py index d5738fd941..c03266e351 100644 --- a/src/autotrain/app/oauth.py +++ b/src/autotrain/app/oauth.py @@ -22,6 +22,18 @@ def attach_oauth(app: fastapi.FastAPI): + """ + Attaches OAuth functionality to a FastAPI application by adding OAuth routes and session middleware. + + Args: + app (fastapi.FastAPI): The FastAPI application instance to which OAuth routes and middleware will be attached. + + Notes: + - The session middleware requires a secret key to sign the cookies. A hash of the OAuth secret key is used to + make it unique to the Space and to ensure it is updated if the OAuth configuration changes. + - The session secret includes a version identifier ("-autotrain-v2") to allow for future changes in the session + cookie format. If the format changes, the version can be bumped to invalidate old cookies and prevent HTTP 500 errors. + """ _add_oauth_routes(app) # Session Middleware requires a secret key to sign the cookies. Let's use a hash # of the OAuth secret key to make it unique to the Space + updated in case OAuth @@ -38,6 +50,23 @@ def attach_oauth(app: fastapi.FastAPI): def _add_oauth_routes(app: fastapi.FastAPI) -> None: + """ + Add OAuth routes to the FastAPI app (login, callback handler, and logout). + + This function performs the following tasks: + 1. Checks for required environment variables and raises a ValueError if any are missing. + 2. Registers the OAuth server with the provided client ID, client secret, scopes, and OpenID provider URL. + 3. Defines the following OAuth routes: + - `/login/huggingface`: Redirects to the Hugging Face OAuth page. + - `/auth`: Handles the OAuth callback and manages the OAuth state. + + Args: + app (fastapi.FastAPI): The FastAPI application instance to which the OAuth routes will be added. 
+ + Raises: + ValueError: If any of the required environment variables (OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET, + OAUTH_SCOPES, OPENID_PROVIDER_URL) are not set. + """ """Add OAuth routes to the FastAPI app (login, callback handler and logout).""" # Check environment variables msg = ( @@ -66,6 +95,15 @@ def _add_oauth_routes(app: fastapi.FastAPI) -> None: # Define OAuth routes @app.get("/login/huggingface") async def oauth_login(request: fastapi.Request): + """ + Handles the OAuth login process by redirecting to the Hugging Face OAuth page. + + Args: + request (fastapi.Request): The incoming HTTP request. + + Returns: + Response: A redirection response to the Hugging Face OAuth authorization page. + """ """Endpoint that redirects to HF OAuth page.""" redirect_uri = request.url_for("auth") redirect_uri_as_str = str(redirect_uri) @@ -75,6 +113,25 @@ async def oauth_login(request: fastapi.Request): @app.get("/auth") async def auth(request: fastapi.Request) -> RedirectResponse: + """ + Handles the OAuth callback for Hugging Face authentication. + + Args: + request (fastapi.Request): The incoming request object. + + Returns: + RedirectResponse: A response object that redirects the user to the appropriate page. + + Raises: + MismatchingStateError: If there is a state mismatch, likely due to a corrupted cookie. + In this case, the user is redirected to the login page after clearing the relevant session keys. + + Notes: + - If the state mismatch occurs, it is likely due to a bug in authlib that causes the token to grow indefinitely + if the user tries to login repeatedly. Since cookies cannot exceed 4kb, the token will be truncated at some point, + resulting in a lost state. The workaround is to delete the cookie and redirect the user to the login page again. + - See https://github.com/lepture/authlib/issues/622 for more details. + """ """Endpoint that handles the OAuth callback.""" try: oauth_info = await oauth.huggingface.authorize_access_token(request) # type: ignore @@ -99,23 +156,17 @@ async def auth(request: fastapi.Request) -> RedirectResponse: return _redirect_to_target(request) -def _generate_redirect_uri(request: fastapi.Request) -> str: - if "_target_url" in request.query_params: - # if `_target_url` already in query params => respect it - target = request.query_params["_target_url"] - else: - # otherwise => keep query params - target = "/?" + urllib.parse.urlencode(request.query_params) - - redirect_uri = request.url_for("oauth_redirect_callback").include_query_params(_target_url=target) - redirect_uri_as_str = str(redirect_uri) - if redirect_uri.netloc.endswith(".hf.space"): - # In Space, FastAPI redirect as http but we want https - redirect_uri_as_str = redirect_uri_as_str.replace("http://", "https://") - return redirect_uri_as_str +def _redirect_to_target(request: fastapi.Request, default_target: str = "/") -> RedirectResponse: + """ + Redirects the incoming request to a target URL specified in the query parameters. + Args: + request (fastapi.Request): The incoming HTTP request. + default_target (str, optional): The default URL to redirect to if no target URL is specified in the query parameters. Defaults to "/". -def _redirect_to_target(request: fastapi.Request, default_target: str = "/") -> RedirectResponse: + Returns: + RedirectResponse: A response object that redirects the client to the target URL. 
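+
+    Example:
+        For instance, a request to ``/auth?_target_url=/ui/`` is redirected to
+        ``/ui/``; if no ``_target_url`` query parameter is present, the
+        ``default_target`` ("/") is used instead.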
+ """ target = request.query_params.get("_target_url", default_target) # target = "https://huggingface.co/spaces/" + os.environ.get("SPACE_ID") return RedirectResponse(target) diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py index 586df83e34..d4f8171817 100644 --- a/src/autotrain/app/params.py +++ b/src/autotrain/app/params.py @@ -154,6 +154,42 @@ @dataclass class AppParams: + """ + AppParams class is responsible for managing and processing parameters for various machine learning tasks. + + Attributes: + job_params_json (str): JSON string containing job parameters. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + task (str): Type of task to be performed. + data_path (str): Path to the dataset. + base_model (str): Base model to be used. + column_mapping (dict): Mapping of columns for the dataset. + train_split (Optional[str]): Name of the training split. Default is None. + valid_split (Optional[str]): Name of the validation split. Default is None. + using_hub_dataset (Optional[bool]): Flag indicating if a hub dataset is used. Default is False. + api (Optional[bool]): Flag indicating if API is used. Default is False. + + Methods: + __post_init__(): Validates the parameters after initialization. + munge(): Processes the parameters based on the task type. + _munge_common_params(): Processes common parameters for all tasks. + _munge_params_sent_transformers(): Processes parameters for sentence transformers task. + _munge_params_llm(): Processes parameters for large language model task. + _munge_params_vlm(): Processes parameters for vision-language model task. + _munge_params_text_clf(): Processes parameters for text classification task. + _munge_params_extractive_qa(): Processes parameters for extractive question answering task. + _munge_params_text_reg(): Processes parameters for text regression task. + _munge_params_token_clf(): Processes parameters for token classification task. + _munge_params_seq2seq(): Processes parameters for sequence-to-sequence task. + _munge_params_img_clf(): Processes parameters for image classification task. + _munge_params_img_reg(): Processes parameters for image regression task. + _munge_params_img_obj_det(): Processes parameters for image object detection task. + _munge_params_tabular(): Processes parameters for tabular data task. + _munge_params_dreambooth(): Processes parameters for DreamBooth training task. + """ + job_params_json: str token: str project_name: str @@ -472,6 +508,21 @@ def _munge_params_dreambooth(self): def get_task_params(task, param_type): + """ + Retrieve task-specific parameters while filtering out hidden parameters based on the task and parameter type. + + Args: + task (str): The task identifier, which can include prefixes like "llm", "st:", "vlm:", etc. + param_type (str): The type of parameters to retrieve, typically "basic" or other types. + + Returns: + dict: A dictionary of task-specific parameters with hidden parameters filtered out. + + Notes: + - The function handles various task prefixes and adjusts the task and trainer variables accordingly. + - Hidden parameters are filtered out based on the task and parameter type. + - Additional hidden parameters are defined for specific tasks and trainers. 
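+
+    Example:
+        An illustrative call; the exact keys returned depend on the task's
+        parameter class and on the hidden-parameter filters:
+
+            from autotrain.app.params import get_task_params
+
+            basic_params = get_task_params("llm:sft", param_type="basic")
+            print(sorted(basic_params.keys()))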
+ """ if task.startswith("llm"): trainer = task.split(":")[1].lower() task = task.split(":")[0].lower() diff --git a/src/autotrain/app/training_api.py b/src/autotrain/app/training_api.py index 716fc1f8da..3fe19b8685 100644 --- a/src/autotrain/app/training_api.py +++ b/src/autotrain/app/training_api.py @@ -23,6 +23,15 @@ def graceful_exit(signum, frame): + """ + Handles the SIGTERM signal to perform cleanup and exit the program gracefully. + + Args: + signum (int): The signal number. + frame (FrameType): The current stack frame (or None). + + Logs a message indicating that SIGTERM was received and then exits the program with status code 0. + """ logger.info("SIGTERM received. Performing cleanup...") sys.exit(0) @@ -31,6 +40,15 @@ def graceful_exit(signum, frame): class BackgroundRunner: + """ + A class to handle background running tasks. + + Methods + ------- + run_main(): + Continuously checks for running jobs and shuts down the server if no jobs are found. + """ + async def run_main(self): while True: running_jobs = get_running_jobs(DB) @@ -45,6 +63,21 @@ async def run_main(self): @asynccontextmanager async def lifespan(app: FastAPI): + """ + Manages the lifespan of the FastAPI application. + + This function is responsible for starting the training process and + managing a background task runner. It logs the process ID of the + training job, adds the job to the database, and ensures the background + task is properly cancelled when the application shuts down. + + Args: + app (FastAPI): The FastAPI application instance. + + Yields: + None: This function is a generator that yields control back to the + FastAPI application lifecycle. + """ process_pid = run_training(params=PARAMS, task_id=TASK_ID) logger.info(f"Started training with PID {process_pid}") DB.add_job(process_pid) diff --git a/src/autotrain/app/ui_routes.py b/src/autotrain/app/ui_routes.py index f66194d12a..fcd861d754 100644 --- a/src/autotrain/app/ui_routes.py +++ b/src/autotrain/app/ui_routes.py @@ -288,6 +288,19 @@ def graceful_exit(signum, frame): + """ + Handles the SIGTERM signal to perform cleanup and exit the program gracefully. + + Args: + signum (int): The signal number. + frame (FrameType): The current stack frame (or None). + + Logs: + Logs the receipt of the SIGTERM signal and the initiation of cleanup. + + Exits: + Exits the program with status code 0. + """ logger.info("SIGTERM received. Performing cleanup...") sys.exit(0) @@ -299,6 +312,23 @@ def graceful_exit(signum, frame): def user_authentication(request: Request): + """ + Authenticates the user based on the following priority: + 1. HF_TOKEN environment variable + 2. OAuth information in session + 3. Token in bearer header (not implemented in the given code) + + Args: + request (Request): The incoming HTTP request object. + + Returns: + str: The authenticated token if verification is successful. + + Raises: + HTTPException: If the token is invalid or expired and the application is not running in a space. + + If the application is running in a space and authentication fails, it returns a login template response. 
+ """ # priority: hf_token env var > oauth_info in session > token in bearer header # if "oauth_info" in request.session: if HF_TOKEN is not None: @@ -474,20 +504,28 @@ async def handle_form( token: str = Depends(user_authentication), ): """ - This function is used to create a new project - :param project_name: str - :param task: str - :param base_model: str - :param hardware: str - :param params: str - :param autotrain_user: str - :param column_mapping: str - :param data_files_training: List[UploadFile] - :param data_files_valid: List[UploadFile] - :param hub_dataset: str - :param train_split: str - :param valid_split: str - :return: JSONResponse + Handle form submission for creating and managing AutoTrain projects. + + Args: + project_name (str): The name of the project. + task (str): The task type (e.g., "image-classification", "text-classification"). + base_model (str): The base model to use for training. + hardware (str): The hardware configuration (e.g., "local-ui"). + params (str): JSON string of additional parameters. + autotrain_user (str): The username of the AutoTrain user. + column_mapping (str): JSON string mapping columns to their roles. + data_files_training (List[UploadFile]): List of training data files. + data_files_valid (List[UploadFile]): List of validation data files. + hub_dataset (str): The Hugging Face Hub dataset identifier. + train_split (str): The training split identifier. + valid_split (str): The validation split identifier. + token (str): The authentication token. + + Returns: + dict: A dictionary containing the success status and monitor URL. + + Raises: + HTTPException: If there are conflicts or validation errors in the form submission. """ train_split = train_split.strip() if len(train_split) == 0: diff --git a/src/autotrain/app/utils.py b/src/autotrain/app/utils.py index e4198af269..55f6d6a6ff 100644 --- a/src/autotrain/app/utils.py +++ b/src/autotrain/app/utils.py @@ -17,6 +17,20 @@ def graceful_exit(signum, frame): def get_running_jobs(db): + """ + Retrieves and manages running jobs from the database. + + This function fetches the list of running jobs from the provided database object. + For each running job, it checks the process status. If the status is "completed", + "error", or "zombie", it attempts to kill the process and remove the job from the + database. After processing, it fetches and returns the updated list of running jobs. + + Args: + db: A database object that provides methods to get and delete running jobs. + + Returns: + list: An updated list of running jobs from the database. + """ running_jobs = db.get_running_jobs() if running_jobs: for _pid in running_jobs: @@ -36,6 +50,18 @@ def get_running_jobs(db): def get_process_status(pid): + """ + Retrieve the status of a process given its PID. + + Args: + pid (int): The process ID of the process to check. + + Returns: + str: The status of the process. If the process does not exist, returns "Completed". + + Raises: + psutil.NoSuchProcess: If no process with the given PID is found. + """ try: process = psutil.Process(pid) proc_status = process.status() @@ -46,7 +72,19 @@ def get_process_status(pid): def kill_process_by_pid(pid): - """Kill process by PID.""" + """ + Kill a process by its PID (Process ID). + + This function attempts to terminate a process with the given PID using the SIGTERM signal. + It logs the outcome of the operation, whether successful or not. + + Args: + pid (int): The Process ID of the process to be terminated. 
+ + Raises: + ProcessLookupError: If no process with the given PID is found. + Exception: If an error occurs while attempting to send the SIGTERM signal. + """ try: os.kill(pid, signal.SIGTERM) logger.info(f"Sent SIGTERM to process with PID {pid}") @@ -57,6 +95,22 @@ def kill_process_by_pid(pid): def token_verification(token): + """ + Verifies the provided token with the Hugging Face API and retrieves user information. + + Args: + token (str): The token to be verified. It can be either an OAuth token (starting with "hf_oauth") + or a regular token (starting with "hf_"). + + Returns: + dict: A dictionary containing user information with the following keys: + - id (str): The user ID. + - name (str): The user's preferred username. + - orgs (list): A list of organizations the user belongs to. + + Raises: + Exception: If the Hugging Face Hub is unreachable or the token is invalid. + """ if token.startswith("hf_oauth"): _api_url = config.HF_API + "/oauth/userinfo" _err_msg = "/oauth/userinfo" @@ -99,6 +153,18 @@ def token_verification(token): def get_user_and_orgs(user_token): + """ + Retrieve the username and organizations associated with the provided user token. + + Args: + user_token (str): The token used to authenticate the user. Must be a valid write token. + + Returns: + list: A list containing the username followed by the organizations the user belongs to. + + Raises: + Exception: If the user token is None or an empty string. + """ if user_token is None: raise Exception("Please login with a write token.") diff --git a/src/autotrain/backends/base.py b/src/autotrain/backends/base.py index e9d34e69ff..418a93a375 100644 --- a/src/autotrain/backends/base.py +++ b/src/autotrain/backends/base.py @@ -62,6 +62,24 @@ @dataclass class BaseBackend: + """ + BaseBackend class is responsible for initializing and validating backend configurations + for various training parameters. It supports multiple types of training parameters + including text classification, image classification, LLM training, and more. + + Attributes: + params (Union[TextClassificationParams, ImageClassificationParams, LLMTrainingParams, + GenericParams, TabularParams, DreamBoothTrainingParams, Seq2SeqParams, + TokenClassificationParams, TextRegressionParams, ObjectDetectionParams, + SentenceTransformersParams, ImageRegressionParams, VLMTrainingParams, + ExtractiveQuestionAnsweringParams]): Training parameters. + backend (str): Backend type. + + Methods: + __post_init__(): Initializes the backend configuration, validates parameters, + sets task IDs, and prepares environment variables. + """ + params: Union[ TextClassificationParams, ImageClassificationParams, diff --git a/src/autotrain/backends/endpoints.py b/src/autotrain/backends/endpoints.py index 31f8293863..84078f01e4 100644 --- a/src/autotrain/backends/endpoints.py +++ b/src/autotrain/backends/endpoints.py @@ -7,6 +7,33 @@ class EndpointsRunner(BaseBackend): + """ + EndpointsRunner is responsible for creating and managing endpoint instances. + + Methods + ------- + create(): + Creates an endpoint instance with the specified hardware and model parameters. + + create() Method + --------------- + Creates an endpoint instance with the specified hardware and model parameters. + + Parameters + ---------- + None + + Returns + ------- + str + The name of the created endpoint instance. + + Raises + ------ + requests.exceptions.RequestException + If there is an issue with the HTTP request. 
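A hedged sketch of the token check described for `token_verification`, using the public `huggingface_hub.whoami` client rather than the raw HTTP calls the module makes; the returned keys are assumptions based on the docstring above.

```python
# Illustrative only: validate a token and pull the username/org list via
# huggingface_hub instead of calling the API endpoints directly.
from huggingface_hub import whoami

def verify_token(token: str) -> dict:
    info = whoami(token=token)  # raises if the token is invalid or HF is unreachable
    return {
        "name": info.get("name"),
        "orgs": [org.get("name") for org in info.get("orgs", [])],
    }
```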
+ """ + def create(self): hardware = self.available_hardware[self.backend] accelerator = hardware.split("_")[2] diff --git a/src/autotrain/backends/local.py b/src/autotrain/backends/local.py index bfaa3a149b..442398141a 100644 --- a/src/autotrain/backends/local.py +++ b/src/autotrain/backends/local.py @@ -4,6 +4,18 @@ class LocalRunner(BaseBackend): + """ + LocalRunner is a class that inherits from BaseBackend and is responsible for managing local training tasks. + + Methods: + create(): + Starts the local training process by retrieving parameters and task ID from environment variables. + Logs the start of the training process. + Runs the training with the specified parameters and task ID. + If the `wait` attribute is False, logs the training process ID (PID). + Returns the training process ID (PID). + """ + def create(self): logger.info("Starting local training...") params = self.env_vars["PARAMS"] diff --git a/src/autotrain/backends/ngc.py b/src/autotrain/backends/ngc.py index 74856bc24c..35cb49d5e9 100644 --- a/src/autotrain/backends/ngc.py +++ b/src/autotrain/backends/ngc.py @@ -18,6 +18,34 @@ class NGCRunner(BaseBackend): + """ + NGCRunner class for managing NGC backend trainings. + + Methods: + _user_authentication_ngc(): + Authenticates the user with NGC and retrieves an authentication token. + Returns: + str: The authentication token. + Raises: + Exception: If an HTTP error or connection error occurs during the request. + + _create_ngc_job(token, url, payload): + Creates a job on NGC using the provided token, URL, and payload. + Args: + token (str): The authentication token. + url (str): The URL for the NGC API endpoint. + payload (dict): The payload containing job details. + Returns: + str: The ID of the created job. + Raises: + Exception: If an HTTP error or connection error occurs during the request. + + create(): + Creates a job on NGC with the specified parameters. + Returns: + str: The ID of the created job. + """ + def _user_authentication_ngc(self): logger.info("Authenticating NGC user...") scope = "group/ngc" diff --git a/src/autotrain/backends/nvcf.py b/src/autotrain/backends/nvcf.py index aee8681ed2..fc0bb445ca 100644 --- a/src/autotrain/backends/nvcf.py +++ b/src/autotrain/backends/nvcf.py @@ -13,6 +13,24 @@ class NVCFRunner(BaseBackend): + """ + NVCFRunner is a backend class responsible for managing and executing NVIDIA NVCF jobs. + + Methods + ------- + _convert_dict_to_object(dictionary): + Recursively converts a dictionary to an object using SimpleNamespace. + + _conf_nvcf(token, nvcf_type, url, job_name, method="POST", payload=None): + Configures and submits an NVCF job using the specified parameters. + + _poll_nvcf(url, token, job_name, method="get", timeout=86400, interval=30, op="poll"): + Polls the status of an NVCF job until completion or timeout. + + create(): + Initiates the creation and polling of an NVCF job. + """ + def _convert_dict_to_object(self, dictionary): if isinstance(dictionary, dict): for key, value in dictionary.items(): diff --git a/src/autotrain/backends/spaces.py b/src/autotrain/backends/spaces.py index 3a48fbcc46..cb6eaf00a7 100644 --- a/src/autotrain/backends/spaces.py +++ b/src/autotrain/backends/spaces.py @@ -18,6 +18,21 @@ class SpaceRunner(BaseBackend): + """ + SpaceRunner is a backend class responsible for creating and managing training jobs on Hugging Face Spaces. + + Methods + ------- + _create_readme(): + Creates a README.md file content for the space. 
+ + _add_secrets(api, space_id): + Adds necessary secrets to the space repository. + + create(): + Creates a new space repository, adds secrets, and uploads necessary files. + """ + def _create_readme(self): _readme = "---\n" _readme += f"title: {self.params.project_name}\n" diff --git a/src/autotrain/cli/run_api.py b/src/autotrain/cli/run_api.py index 87bd5b3a57..6edfd5d6bc 100644 --- a/src/autotrain/cli/run_api.py +++ b/src/autotrain/cli/run_api.py @@ -12,6 +12,23 @@ def run_api_command_factory(args): class RunAutoTrainAPICommand(BaseAutoTrainCommand): + """ + Command to run the AutoTrain API. + + This command sets up and runs the AutoTrain API using the specified host and port. + + Methods + ------- + register_subcommand(parser: ArgumentParser) + Registers the 'api' subcommand and its arguments to the provided parser. + + __init__(port: int, host: str, task: str) + Initializes the command with the specified port, host, and task. + + run() + Runs the AutoTrain API using the uvicorn server. + """ + @staticmethod def register_subcommand(parser: ArgumentParser): run_api_parser = parser.add_parser( diff --git a/src/autotrain/cli/run_app.py b/src/autotrain/cli/run_app.py index bc76d0fb46..bdf0ea1969 100644 --- a/src/autotrain/cli/run_app.py +++ b/src/autotrain/cli/run_app.py @@ -11,6 +11,17 @@ def handle_output(stream, log_file): + """ + Continuously reads lines from a given stream and writes them to both + standard output and a log file until the stream is exhausted. + + Args: + stream (io.TextIOBase): The input stream to read lines from. + log_file (io.TextIOBase): The log file to write lines to. + + Returns: + None + """ while True: line = stream.readline() if not line: @@ -26,6 +37,25 @@ def run_app_command_factory(args): class RunAutoTrainAppCommand(BaseAutoTrainCommand): + """ + Command to run the AutoTrain application. + + This command sets up and runs the AutoTrain application with the specified + configuration options such as port, host, number of workers, and sharing options. + + Methods + ------- + register_subcommand(parser: ArgumentParser): + Registers the subcommand and its arguments to the provided parser. + + __init__(port: int, host: str, share: bool, workers: int, colab: bool): + Initializes the command with the specified parameters. + + run(): + Executes the command to run the AutoTrain application. Handles different + modes such as running in Colab or sharing via ngrok. + """ + @staticmethod def register_subcommand(parser: ArgumentParser): run_app_parser = parser.add_parser( diff --git a/src/autotrain/commands.py b/src/autotrain/commands.py index b6d4836031..c3dfd34170 100644 --- a/src/autotrain/commands.py +++ b/src/autotrain/commands.py @@ -38,15 +38,22 @@ def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_backend=None): """ - Returns the accelerate command based on the number of GPUs available. + Generates the appropriate command to launch a training job using the `accelerate` library based on the number of GPUs + and the specified distributed backend. Args: - num_gpus: Number of GPUs available. - gradient_accumulation_steps: Number of gradient accumulation steps. - distributed_backend: Distributed backend to use: ddp, deepspeed, None. + num_gpus (int): The number of GPUs available for training. If 0, training will be forced on CPU. + gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 1. + distributed_backend (str, optional): The distributed backend to use. 
Can be "ddp" (Distributed Data Parallel), + "deepspeed", or None. Defaults to None. Returns: - List: Accelerate command. + list or str: The command to be executed as a list of strings. If no GPU is found, returns a CPU command string. + If a single GPU is found, returns a single GPU command string. Otherwise, returns a list of + command arguments for multi-GPU or DeepSpeed training. + + Raises: + ValueError: If an unsupported distributed backend is specified. """ if num_gpus == 0: logger.warning("No GPU found. Forcing training on CPU. This will be super slow!") @@ -91,15 +98,30 @@ def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_ def launch_command(params): """ - Launches training command based on the given parameters. + Launches the appropriate training command based on the type of training parameters provided. Args: - params: An instance of a parameter class (LLMTrainingParams, DreamBoothTrainingParams, GenericParams, TabularParams, - TextClassificationParams, TextRegressionParams, TokenClassificationParams, ImageClassificationParams, - ObjectDetectionParams, Seq2SeqParams). + params (object): An instance of one of the training parameter classes. This can be one of the following: + - LLMTrainingParams + - DreamBoothTrainingParams + - GenericParams + - TabularParams + - TextClassificationParams + - TextRegressionParams + - SentenceTransformersParams + - ExtractiveQuestionAnsweringParams + - TokenClassificationParams + - ImageClassificationParams + - ObjectDetectionParams + - ImageRegressionParams + - Seq2SeqParams + - VLMTrainingParams Returns: - None + list: A list of command line arguments to be executed for training. + + Raises: + ValueError: If the provided params type is unsupported. """ params.project_name = shlex.split(params.project_name)[0] diff --git a/src/autotrain/dataset.py b/src/autotrain/dataset.py index a98fa1afde..ab69abbdc9 100644 --- a/src/autotrain/dataset.py +++ b/src/autotrain/dataset.py @@ -34,6 +34,19 @@ def remove_non_image_files(folder): + """ + Remove non-image files from a specified folder and its subfolders. + + This function iterates through all files in the given folder and its subfolders, + and removes any file that does not have an allowed image file extension. The allowed + extensions are: .jpg, .jpeg, .png, .JPG, .JPEG, .PNG, and .jsonl. + + Args: + folder (str): The path to the folder from which non-image files should be removed. + + Returns: + None + """ # Define allowed image file extensions allowed_extensions = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".jsonl"} @@ -56,6 +69,31 @@ def remove_non_image_files(folder): @dataclass class AutoTrainDreamboothDataset: + """ + AutoTrainDreamboothDataset prepares dataset for Dreambooth task. + + Attributes: + concept_images (List[Any]): A list of images related to the concept. + concept_name (str): The name of the concept. + token (str): The token associated with the concept. + project_name (str): The name of the project. + username (Optional[str]): The username of the person associated with the project. Defaults to None. + local (bool): A flag indicating whether the dataset is local. Defaults to False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset, including the project name and task. + + __post_init__(): + Initializes the task attribute to "dreambooth". + + num_samples() -> int: + Returns the number of samples in the concept_images list. 
+ + prepare(): + Prepares the dataset using the DreamboothPreprocessor and returns the preprocessed data. + """ + concept_images: List[Any] concept_name: str token: str @@ -88,6 +126,29 @@ def prepare(self): @dataclass class AutoTrainImageClassificationDataset: + """ + A class to handle image classification datasets for AutoTrain. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to use for validation. Default is None. + local (bool): Flag to indicate if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data. + + prepare(): + Prepares the dataset for training by extracting and preprocessing the data. + """ + train_data: str token: str project_name: str @@ -164,6 +225,29 @@ def prepare(self): @dataclass class AutoTrainObjectDetectionDataset: + """ + A dataset class for AutoTrain object detection tasks. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to be used for validation. Default is None. + local (bool): Flag indicating if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data. + + prepare(): + Prepares the dataset for training by extracting and preprocessing the data. + """ + train_data: str token: str project_name: str @@ -240,6 +324,40 @@ def prepare(self): @dataclass class AutoTrainVLMDataset: + """ + A class to handle dataset for AutoTrain Vision-Language Model (VLM) task. + + Attributes: + ----------- + train_data : str + Path to the training data or a file-like object containing the training data. + token : str + Authentication token for accessing the dataset. + project_name : str + Name of the project. + username : str + Username of the project owner. + column_mapping : Dict[str, str] + Mapping of columns in the dataset. + valid_data : Optional[str], default=None + Path to the validation data or a file-like object containing the validation data. + percent_valid : Optional[float], default=None + Percentage of the training data to be used for validation if `valid_data` is not provided. + local : bool, default=False + Flag indicating whether the dataset is stored locally. + + Methods: + -------- + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data percentage. + + prepare(): + Prepares the dataset for training by extracting and processing the data. + """ + train_data: str token: str project_name: str @@ -318,6 +436,29 @@ def prepare(self): @dataclass class AutoTrainImageRegressionDataset: + """ + AutoTrainImageRegressionDataset is a class designed for handling image regression datasets in the AutoTrain framework. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. 
+ project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to be used for validation if valid_data is not provided. Default is None. + local (bool): Flag indicating if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset information. + + __post_init__(): + Initializes the task attribute and sets the percent_valid attribute based on the presence of valid_data. + + prepare(): + Prepares the dataset for training by extracting and organizing the data, and returns a preprocessor object. + """ + train_data: str token: str project_name: str @@ -394,6 +535,30 @@ def prepare(self): @dataclass class AutoTrainDataset: + """ + AutoTrainDataset class for handling various types of datasets and preprocessing tasks. + + Attributes: + train_data (List[str]): List of file paths or DataFrames for training data. + task (str): The type of task to perform (e.g., "text_binary_classification"). + token (str): Authentication token. + project_name (str): Name of the project. + username (Optional[str]): Username of the project owner. Defaults to None. + column_mapping (Optional[Dict[str, str]]): Mapping of column names. Defaults to None. + valid_data (Optional[List[str]]): List of file paths or DataFrames for validation data. Defaults to None. + percent_valid (Optional[float]): Percentage of training data to use for validation. Defaults to None. + convert_to_class_label (Optional[bool]): Whether to convert labels to class labels. Defaults to False. + local (bool): Whether the data is local. Defaults to False. + ext (Optional[str]): File extension of the data files. Defaults to "csv". + + Methods: + __str__(): Returns a string representation of the dataset. + __post_init__(): Initializes validation data and preprocesses the data. + _preprocess_data(): Preprocesses the training and validation data. + num_samples(): Returns the total number of samples in the dataset. + prepare(): Prepares the dataset for the specified task using the appropriate preprocessor. + """ + train_data: List[str] task: str token: str diff --git a/src/autotrain/logging.py b/src/autotrain/logging.py index 6492eb08e9..24d0586ae2 100644 --- a/src/autotrain/logging.py +++ b/src/autotrain/logging.py @@ -7,6 +7,24 @@ @dataclass class Logger: + """ + A custom logger class that sets up and manages logging configuration. + + Methods + ------- + __post_init__(): + Initializes the logger with a specific format and sets up the logger. + + _should_log(record): + Determines if a log record should be logged based on the process state. + + setup_logger(): + Configures the logger to output to stdout with the specified format and filter. + + get_logger(): + Returns the configured logger instance. + """ + def __post_init__(self): self.log_format = ( "{level: <8} | " diff --git a/src/autotrain/parser.py b/src/autotrain/parser.py index 503e21c06b..f5e7a56486 100644 --- a/src/autotrain/parser.py +++ b/src/autotrain/parser.py @@ -39,6 +39,28 @@ @dataclass class AutoTrainConfigParser: + """ + AutoTrainConfigParser is a class responsible for parsing and validating the yaml configuration + required to run various tasks in the AutoTrain framework. It supports loading configurations + from both local files and remote URLs, and maps task aliases to their respective parameters + and data munging functions. 
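The format string above (`{level: <8} | ...`) suggests a loguru-style logger; a minimal, self-contained setup along those lines (an assumption, not the module's exact configuration) is:

```python
# Minimal sketch, assuming loguru: log to stdout with a fixed format.
import sys
from loguru import logger

logger.remove()  # drop the default handler
logger.add(
    sys.stdout,
    format="{level: <8} | {time:YYYY-MM-DD HH:mm:ss} | {name}:{function}:{line} - {message}",
)
logger.info("logger configured")
```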
+ + Attributes: + config_path (str): Path or URL to the configuration file. + config (dict): Parsed configuration data. + task_param_map (dict): Mapping of task names to their parameter classes. + munge_data_map (dict): Mapping of task names to their data munging functions. + task_aliases (dict): Mapping of task aliases to their canonical task names. + task (str): The resolved task name from the configuration. + backend (str): The backend specified in the configuration. + parsed_config (dict): The parsed configuration parameters. + + Methods: + __post_init__(): Initializes the parser, loads the configuration, and validates required fields. + _parse_config(): Parses the configuration and extracts relevant parameters based on the task. + run(): Executes the task with the parsed configuration. + """ + config_path: str def __post_init__(self): diff --git a/src/autotrain/preprocessor/dreambooth.py b/src/autotrain/preprocessor/dreambooth.py index bd476a4ada..e6beeb1401 100644 --- a/src/autotrain/preprocessor/dreambooth.py +++ b/src/autotrain/preprocessor/dreambooth.py @@ -11,6 +11,26 @@ @dataclass class DreamboothPreprocessor: + """ + DreamboothPreprocessor is a class responsible for preparing concept images and prompts data for DreamBooth Task. + + Attributes: + concept_images (List[Any]): A list of concept images to be processed. + concept_name (str): The name of the concept. + username (str): The username of the person creating the project. + project_name (str): The name of the project. + token (str): The authentication token for accessing the repository. + local (bool): A flag indicating whether the processing is local or remote. + + Methods: + __post_init__(): Initializes the repository name and creates a remote repository if not local. + _upload_concept_images(file, api): Uploads a concept image to the remote repository. + _upload_concept_prompts(api): Uploads the concept prompts to the remote repository. + _save_concept_images(file): Saves a concept image locally. + _save_concept_prompts(): Saves the concept prompts locally. + prepare(): Prepares the concept images and prompts by either saving them locally or uploading them to a remote repository. + """ + concept_images: List[Any] concept_name: str username: str diff --git a/src/autotrain/preprocessor/tabular.py b/src/autotrain/preprocessor/tabular.py index 0657281e84..defcb3c4a7 100644 --- a/src/autotrain/preprocessor/tabular.py +++ b/src/autotrain/preprocessor/tabular.py @@ -11,6 +11,28 @@ @dataclass class TabularBinaryClassificationPreprocessor: + """ + A preprocessor class for tabular binary classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + label_column (str): The name of the label column in the training data. + username (str): The username for the Hugging Face Hub. + project_name (str): The name of the project. + token (str): The authentication token for the Hugging Face Hub. + id_column (Optional[str]): The name of the ID column in the training data. Default is None. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + + Methods: + __post_init__(): Validates the presence of required columns in the training and validation data. 
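The config parser above accepts either a local path or a URL; a stripped-down sketch of that loading step (illustrative, with none of the task mapping) could be:

```python
# Sketch of the documented loading behaviour: fetch remote YAML over HTTP,
# otherwise read it from disk, and return the parsed dictionary.
import requests
import yaml

def load_config(config_path: str) -> dict:
    if config_path.startswith(("http://", "https://")):
        response = requests.get(config_path, timeout=30)
        response.raise_for_status()
        return yaml.safe_load(response.text)
    with open(config_path) as f:
        return yaml.safe_load(f)
```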
+ split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns by adding 'autotrain_id' and 'autotrain_label', and drops the original ID and label columns. + prepare(): Prepares the dataset by splitting, processing columns, and saving or pushing the dataset to the Hugging Face Hub. + """ + train_data: pd.DataFrame label_column: str username: str @@ -125,6 +147,28 @@ def split(self): @dataclass class TabularMultiLabelClassificationPreprocessor: + """ + TabularMultiLabelClassificationPreprocessor is a class for preprocessing tabular data for multi-label classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + label_column (List[str]): List of columns to be used as labels. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + id_column (Optional[str]): The column to be used as an identifier. Defaults to None. + valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2. + seed (Optional[int]): The random seed for splitting the data. Defaults to 42. + token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False. + + Methods: + __post_init__(): Validates the presence of id_column and label_column in train_data and valid_data, and checks for reserved column names. + split(): Splits the train_data into training and validation sets if valid_data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns by adding autotrain_id and autotrain_label columns, and drops the original id_column and label_column. + prepare(): Prepares the dataset by splitting the data, preparing the columns, and converting to Hugging Face Dataset format. Saves the dataset locally or pushes to the Hugging Face Hub. + """ + train_data: pd.DataFrame label_column: List[str] username: str diff --git a/src/autotrain/preprocessor/text.py b/src/autotrain/preprocessor/text.py index f246a4b888..0c2a991818 100644 --- a/src/autotrain/preprocessor/text.py +++ b/src/autotrain/preprocessor/text.py @@ -20,6 +20,29 @@ @dataclass class TextBinaryClassificationPreprocessor: + """ + A preprocessor class for binary text classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the column containing text data. + label_column (str): The name of the column containing label data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for saving datasets. + token (str): The authentication token for the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2. + seed (Optional[int]): The random seed for splitting the data. Defaults to 42. + convert_to_class_label (Optional[bool]): Whether to convert labels to class labels. Defaults to False. + local (Optional[bool]): Whether to save the dataset locally. Defaults to False. + + Methods: + __post_init__(): Validates the presence of required columns in the dataframes and checks for reserved column names. 
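Several of these preprocessors document the same `prepare_columns` step: copy the user's id/label columns into reserved `autotrain_*` names and drop the originals. A generic pandas sketch of that idea:

```python
# Generic sketch of the column renaming the preprocessors describe; the real
# classes also handle missing id columns and multi-label cases.
import pandas as pd

def prepare_columns(df: pd.DataFrame, id_column: str, label_column: str) -> pd.DataFrame:
    df = df.copy()
    df["autotrain_id"] = df[id_column]
    df["autotrain_label"] = df[label_column]
    return df.drop(columns=[id_column, label_column])
```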
+ split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation dataframes. + prepare(): Prepares the datasets for training and validation, converts labels if required, and saves or uploads the datasets. + """ + train_data: pd.DataFrame text_column: str label_column: str @@ -122,10 +145,38 @@ def prepare(self): class TextMultiClassClassificationPreprocessor(TextBinaryClassificationPreprocessor): + """ + TextMultiClassClassificationPreprocessor is a class for preprocessing text data for multi-class classification tasks. + + This class inherits from TextBinaryClassificationPreprocessor and is designed to handle scenarios where the text data + needs to be classified into more than two categories. + + Methods: + Inherits all methods from TextBinaryClassificationPreprocessor. + + Attributes: + Inherits all attributes from TextBinaryClassificationPreprocessor. + """ + pass class TextSingleColumnRegressionPreprocessor(TextBinaryClassificationPreprocessor): + """ + A preprocessor class for single-column regression tasks, inheriting from TextBinaryClassificationPreprocessor. + + Methods + ------- + split(): + Splits the training data into training and validation sets. If validation data is already provided, it returns + the training and validation data as is. Otherwise, it performs a train-test split on the training data. + + prepare(): + Prepares the training and validation datasets by splitting the data, preparing the columns, and converting + them to Hugging Face Datasets. The datasets are then either saved locally or pushed to the Hugging Face Hub, + depending on the `local` attribute. + """ + def split(self): if self.valid_data is not None: return self.train_data, self.valid_data @@ -174,6 +225,21 @@ def prepare(self): class TextTokenClassificationPreprocessor(TextBinaryClassificationPreprocessor): + """ + A preprocessor class for text token classification tasks, inheriting from TextBinaryClassificationPreprocessor. + + Methods + ------- + split(): + Splits the training data into training and validation sets. If validation data is already provided, it returns + the training and validation data as is. Otherwise, it splits the training data based on the test size and seed. + + prepare(): + Prepares the training and validation data for token classification. This includes splitting the data, preparing + columns, evaluating text and label columns, and converting them to datasets. The datasets are then either saved + locally or pushed to the Hugging Face Hub based on the configuration. + """ + def split(self): if self.valid_data is not None: return self.train_data, self.valid_data @@ -243,6 +309,46 @@ def prepare(self): @dataclass class LLMPreprocessor: + """ + A class used to preprocess data for large language model (LLM) training. + + Attributes + ---------- + train_data : pd.DataFrame + The training data. + username : str + The username for the Hugging Face Hub. + project_name : str + The name of the project. + token : str + The token for authentication. + valid_data : Optional[pd.DataFrame], optional + The validation data, by default None. + test_size : Optional[float], optional + The size of the test split, by default 0.2. + seed : Optional[int], optional + The random seed, by default 42. + text_column : Optional[str], optional + The name of the text column, by default None. + prompt_column : Optional[str], optional + The name of the prompt column, by default None. 
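The `split()` method documented for these classes shares one behaviour: reuse the provided validation frame when there is one, otherwise hold out `test_size` of the training data with a fixed seed. A minimal sketch, assuming scikit-learn:

```python
# Hedged sketch of the shared split behaviour; classification preprocessors
# may additionally stratify on the label column.
import pandas as pd
from sklearn.model_selection import train_test_split

def split(train_df: pd.DataFrame, valid_df=None, test_size=0.2, seed=42):
    if valid_df is not None:
        return train_df, valid_df
    train_part, valid_part = train_test_split(train_df, test_size=test_size, random_state=seed)
    return train_part.reset_index(drop=True), valid_part.reset_index(drop=True)
```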
+ rejected_text_column : Optional[str], optional + The name of the rejected text column, by default None. + local : Optional[bool], optional + Whether to save the dataset locally, by default False. + + Methods + ------- + __post_init__() + Validates the provided columns and checks for reserved column names. + split() + Splits the data into training and validation sets. + prepare_columns(train_df, valid_df) + Prepares the columns for training and validation datasets. + prepare() + Prepares the datasets and pushes them to the Hugging Face Hub or saves them locally. + """ + train_data: pd.DataFrame username: str project_name: str @@ -339,6 +445,28 @@ def prepare(self): @dataclass class Seq2SeqPreprocessor: + """ + Seq2SeqPreprocessor is a class for preprocessing sequence-to-sequence training data. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the column containing the input text. + label_column (str): The name of the column containing the labels. + username (str): The username for pushing data to the hub. + project_name (str): The name of the project. + token (str): The token for authentication. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the hub. Default is False. + + Methods: + __post_init__(): Validates the presence of required columns in the training and validation data. + split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation data. + prepare(): Prepares the dataset for training by splitting, preparing columns, and converting to Dataset objects. + """ + train_data: pd.DataFrame text_column: str label_column: str @@ -430,6 +558,31 @@ def prepare(self): @dataclass class SentenceTransformersPreprocessor: + """ + A preprocessor class for preparing datasets for sentence transformers. + + Attributes: + train_data (pd.DataFrame): The training data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + token (str): The token for authentication with the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + sentence1_column (Optional[str]): The name of the first sentence column. Default is "sentence1". + sentence2_column (Optional[str]): The name of the second sentence column. Default is "sentence2". + sentence3_column (Optional[str]): The name of the third sentence column. Default is "sentence3". + target_column (Optional[str]): The name of the target column. Default is "target". + convert_to_class_label (Optional[bool]): Whether to convert the target column to class labels. Default is False. + + Methods: + __post_init__(): Ensures no reserved columns are in train_data or valid_data. + split(): Splits the train_data into training and validation sets if valid_data is not provided. 
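The `prepare()` methods end the same way for most of these classes: convert the frames to `datasets.Dataset` objects, then either save them locally or push them to the Hub. A generic sketch of that last step (paths and split names are illustrative):

```python
# Illustrative final step shared by the preprocessors: save locally or push to
# the Hugging Face Hub as train/validation splits.
from datasets import Dataset

def finalize(train_df, valid_df, repo_id, token, project_name, local=False):
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    if local:
        train_ds.save_to_disk(f"{project_name}/autotrain-data/train")
        valid_ds.save_to_disk(f"{project_name}/autotrain-data/validation")
    else:
        train_ds.push_to_hub(repo_id, split="train", private=True, token=token)
        valid_ds.push_to_hub(repo_id, split="validation", private=True, token=token)
```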
+ prepare_columns(train_df, valid_df): Prepares the columns for training and validation datasets. + prepare(): Prepares the datasets and either saves them locally or pushes them to the Hugging Face Hub. + """ + train_data: pd.DataFrame username: str project_name: str @@ -530,6 +683,29 @@ def prepare(self): @dataclass class TextExtractiveQuestionAnsweringPreprocessor: + """ + Preprocessor for text extractive question answering tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the text column in the data. + question_column (str): The name of the question column in the data. + answer_column (str): The name of the answer column in the data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + token (str): The token for authentication with the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + + Methods: + __post_init__(): Validates the columns in the training and validation data and converts the answer column to a dictionary. + split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation data. + prepare(): Prepares the dataset for training by splitting, preparing columns, and converting to Hugging Face Dataset format. + """ + train_data: pd.DataFrame text_column: str question_column: str diff --git a/src/autotrain/preprocessor/vision.py b/src/autotrain/preprocessor/vision.py index 4c2c7077f8..b1075888a5 100644 --- a/src/autotrain/preprocessor/vision.py +++ b/src/autotrain/preprocessor/vision.py @@ -14,6 +14,38 @@ @dataclass class ImageClassificationPreprocessor: + """ + A class used to preprocess image data for classification tasks. + + Attributes + ---------- + train_data : str + Path to the training data directory. + username : str + Username for the Hugging Face Hub. + project_name : str + Name of the project. + token : str + Authentication token for the Hugging Face Hub. + valid_data : Optional[str], optional + Path to the validation data directory, by default None. + test_size : Optional[float], optional + Proportion of the dataset to include in the validation split, by default 0.2. + seed : Optional[int], optional + Random seed for reproducibility, by default 42. + local : Optional[bool], optional + Whether to save the dataset locally or push to the Hugging Face Hub, by default False. + + Methods + ------- + __post_init__(): + Validates the structure and contents of the training and validation data directories. + split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + Splits the dataframe into training and validation sets. + prepare() -> str: + Prepares the dataset for training and either saves it locally or pushes it to the Hugging Face Hub. + """ + train_data: str username: str project_name: str @@ -160,6 +192,40 @@ def prepare(self): @dataclass class ObjectDetectionPreprocessor: + """ + A class to preprocess data for object detection tasks. + + Attributes: + ----------- + train_data : str + Path to the training data directory. + username : str + Username for the Hugging Face Hub. 
+ project_name : str + Name of the project. + token : str + Authentication token for the Hugging Face Hub. + valid_data : Optional[str], default=None + Path to the validation data directory. + test_size : Optional[float], default=0.2 + Proportion of the dataset to include in the validation split. + seed : Optional[int], default=42 + Random seed for reproducibility. + local : Optional[bool], default=False + Whether to save the dataset locally or push to the Hugging Face Hub. + + Methods: + -------- + _process_metadata(data_path): + Processes the metadata.jsonl file and extracts required columns and categories. + __post_init__(): + Validates the existence and content of the training and validation data directories. + split(df): + Splits the dataframe into training and validation sets. + prepare(): + Prepares the dataset for training by processing metadata, splitting data, and saving or pushing the dataset. + """ + train_data: str username: str project_name: str diff --git a/src/autotrain/preprocessor/vlm.py b/src/autotrain/preprocessor/vlm.py index d08bca0b15..1f5edf1fd9 100644 --- a/src/autotrain/preprocessor/vlm.py +++ b/src/autotrain/preprocessor/vlm.py @@ -14,6 +14,38 @@ @dataclass class VLMPreprocessor: + """ + VLMPreprocessor is a class for preprocessing visual language model (VLM) datasets. It handles tasks such as + validating data paths, ensuring the presence of required files, splitting datasets, and preparing data for + training and validation. + + Attributes: + train_data (str): Path to the training data directory. + username (str): Username for the Hugging Face Hub. + project_name (str): Name of the project. + token (str): Authentication token for the Hugging Face Hub. + column_mapping (dict): Mapping of column names. + valid_data (Optional[str]): Path to the validation data directory. Default is None. + test_size (Optional[float]): Proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): Random seed for dataset splitting. Default is 42. + local (Optional[bool]): Flag indicating whether to save data locally or push to the Hugging Face Hub. Default is False. + + Methods: + _process_metadata(data_path): + Processes the metadata.jsonl file in the given data path and ensures it contains the required columns. + + __post_init__(): + Validates the existence of training and validation data paths, checks for required files, and ensures + the presence of a minimum number of image files. + + split(df): + Splits the given DataFrame into training and validation sets based on the specified test size and seed. + + prepare(): + Prepares the dataset for training and validation by copying data to a cache directory, processing metadata, + and either saving the dataset locally or pushing it to the Hugging Face Hub. 
+ """ + train_data: str username: str project_name: str diff --git a/src/autotrain/project.py b/src/autotrain/project.py index 80f3d5b1cb..de65f153e1 100644 --- a/src/autotrain/project.py +++ b/src/autotrain/project.py @@ -26,6 +26,24 @@ @dataclass class AutoTrainProject: + """ + A class to represent an AutoTrain project + + Attributes + ---------- + params : Union[List[Union[LLMTrainingParams, TextClassificationParams, TabularParams, DreamBoothTrainingParams, Seq2SeqParams, ImageClassificationParams, TextRegressionParams, ObjectDetectionParams, TokenClassificationParams, SentenceTransformersParams, ImageRegressionParams]], LLMTrainingParams, TextClassificationParams, TabularParams, DreamBoothTrainingParams, Seq2SeqParams, ImageClassificationParams, TextRegressionParams, ObjectDetectionParams, TokenClassificationParams, SentenceTransformersParams, ImageRegressionParams] + The parameters for the AutoTrain project. + backend : str + The backend to be used for the AutoTrain project. + + Methods + ------- + __post_init__(): + Validates the backend attribute. + create(): + Creates a runner based on the backend and initializes the AutoTrain project. + """ + params: Union[ List[ Union[ diff --git a/src/autotrain/tools/convert_to_kohya.py b/src/autotrain/tools/convert_to_kohya.py index c42ab117e5..970aa1247a 100644 --- a/src/autotrain/tools/convert_to_kohya.py +++ b/src/autotrain/tools/convert_to_kohya.py @@ -5,6 +5,16 @@ def convert_to_kohya(input_path, output_path): + """ + Converts a Lora state dictionary to a Kohya state dictionary and saves it to the specified output path. + + Args: + input_path (str): The file path to the input Lora state dictionary. + output_path (str): The file path where the converted Kohya state dictionary will be saved. + + Returns: + None + """ logger.info(f"Converting Lora state dict from {input_path} to Kohya state dict at {output_path}") lora_state_dict = load_file(input_path) peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict) diff --git a/src/autotrain/tools/merge_adapter.py b/src/autotrain/tools/merge_adapter.py index f17c56c5f1..df4d1a772a 100644 --- a/src/autotrain/tools/merge_adapter.py +++ b/src/autotrain/tools/merge_adapter.py @@ -9,6 +9,23 @@ def merge_llm_adapter( base_model_path, adapter_path, token, output_folder=None, pad_to_multiple_of=None, push_to_hub=False ): + """ + Merges a language model adapter into a base model and optionally saves or pushes the merged model. + + Args: + base_model_path (str): Path to the base model. + adapter_path (str): Path to the adapter model. + token (str): Authentication token for accessing the models. + output_folder (str, optional): Directory to save the merged model. Defaults to None. + pad_to_multiple_of (int, optional): If specified, pad the token embeddings to a multiple of this value. Defaults to None. + push_to_hub (bool, optional): If True, push the merged model to the Hugging Face Hub. Defaults to False. + + Raises: + ValueError: If neither `output_folder` nor `push_to_hub` is specified. 
+ + Returns: + None + """ if output_folder is None and push_to_hub is False: raise ValueError("You must specify either --output_folder or --push_to_hub") diff --git a/src/autotrain/trainers/clm/params.py b/src/autotrain/trainers/clm/params.py index a2362bd68a..1a65b9007a 100644 --- a/src/autotrain/trainers/clm/params.py +++ b/src/autotrain/trainers/clm/params.py @@ -6,67 +6,135 @@ class LLMTrainingParams(AutoTrainParams): - model: str = Field("gpt2", title="Model name") - project_name: str = Field("project-name", title="Output directory") + """ + LLMTrainingParams: Parameters for training a language model using the autotrain library. + + Attributes: + model (str): Model name to be used for training. Default is "gpt2". + project_name (str): Name of the project and output directory. Default is "project-name". + + data_path (str): Path to the dataset. Default is "data". + train_split (str): Configuration for the training data split. Default is "train". + valid_split (Optional[str]): Configuration for the validation data split. Default is None. + add_eos_token (bool): Whether to add an EOS token at the end of sequences. Default is True. + block_size (Union[int, List[int]]): Size of the blocks for training, can be a single integer or a list of integers. Default is -1. + model_max_length (int): Maximum length of the model input. Default is 2048. + padding (Optional[str]): Side on which to pad sequences (left or right). Default is "right". + + trainer (str): Type of trainer to use. Default is "default". + use_flash_attention_2 (bool): Whether to use flash attention version 2. Default is False. + log (str): Logging method for experiment tracking. Default is "none". + disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing. Default is False. + logging_steps (int): Number of steps between logging events. Default is -1. + eval_strategy (str): Strategy for evaluation (e.g., 'epoch'). Default is "epoch". + save_total_limit (int): Maximum number of checkpoints to keep. Default is 1. + auto_find_batch_size (bool): Whether to automatically find the optimal batch size. Default is False. + mixed_precision (Optional[str]): Type of mixed precision to use (e.g., 'fp16', 'bf16', or None). Default is None. + lr (float): Learning rate for training. Default is 3e-5. + epochs (int): Number of training epochs. Default is 1. + batch_size (int): Batch size for training. Default is 2. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 4. + optimizer (str): Optimizer to use for training. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply to the optimizer. Default is 0.0. + max_grad_norm (float): Maximum norm for gradient clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + chat_template (Optional[str]): Template for chat-based models, options include: None, zephyr, chatml, or tokenizer. Default is None. + + quantization (Optional[str]): Quantization method to use (e.g., 'int4', 'int8', or None). Default is "int4". + target_modules (Optional[str]): Target modules for quantization or fine-tuning. Default is "all-linear". + merge_adapter (bool): Whether to merge the adapter layers. Default is False. + peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). Default is False. + lora_r (int): Rank of the LoRA matrices. 
Default is 16. + lora_alpha (int): Alpha parameter for LoRA. Default is 32. + lora_dropout (float): Dropout rate for LoRA. Default is 0.05. + + model_ref (Optional[str]): Reference model for DPO trainer. Default is None. + dpo_beta (float): Beta parameter for DPO trainer. Default is 0.1. + + max_prompt_length (int): Maximum length of the prompt. Default is 128. + max_completion_length (Optional[int]): Maximum length of the completion. Default is None. + + prompt_text_column (Optional[str]): Column name for the prompt text. Default is None. + text_column (str): Column name for the text data. Default is "text". + rejected_text_column (Optional[str]): Column name for the rejected text data. Default is None. + + push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + username (Optional[str]): Hugging Face username for authentication. Default is None. + token (Optional[str]): Hugging Face token for authentication. Default is None. + + unsloth (bool): Whether to use the unsloth library. Default is False. + distributed_backend (Optional[str]): Backend to use for distributed training. Default is None. + """ + + model: str = Field("gpt2", title="Model name to be used for training") + project_name: str = Field("project-name", title="Name of the project and output directory") # data params - data_path: str = Field("data", title="Data path") - train_split: str = Field("train", title="Train data config") - valid_split: Optional[str] = Field(None, title="Validation data config") - add_eos_token: bool = Field(True, title="Add EOS token") - block_size: Union[int, List[int]] = Field(-1, title="Block size") - model_max_length: int = Field(2048, title="Model max length") - padding: Optional[str] = Field("right", title="Padding side") + data_path: str = Field("data", title="Path to the dataset") + train_split: str = Field("train", title="Configuration for the training data split") + valid_split: Optional[str] = Field(None, title="Configuration for the validation data split") + add_eos_token: bool = Field(True, title="Whether to add an EOS token at the end of sequences") + block_size: Union[int, List[int]] = Field( + -1, title="Size of the blocks for training, can be a single integer or a list of integers" + ) + model_max_length: int = Field(2048, title="Maximum length of the model input") + padding: Optional[str] = Field("right", title="Side on which to pad sequences (left or right)") # trainer params - trainer: str = Field("default", title="Trainer type") - use_flash_attention_2: bool = Field(False, title="Use flash attention 2") - log: str = Field("none", title="Logging using experiment tracking") - disable_gradient_checkpointing: bool = Field(False, title="Gradient checkpointing") - logging_steps: int = Field(-1, title="Logging steps") - eval_strategy: str = Field("epoch", title="Evaluation strategy") - save_total_limit: int = Field(1, title="Save total limit") - auto_find_batch_size: bool = Field(False, title="Auto find batch size") - mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") - lr: float = Field(3e-5, title="Learning rate") + trainer: str = Field("default", title="Type of trainer to use") + use_flash_attention_2: bool = Field(False, title="Whether to use flash attention version 2") + log: str = Field("none", title="Logging method for experiment tracking") + disable_gradient_checkpointing: bool = Field(False, title="Whether to disable gradient checkpointing") + logging_steps: int = Field(-1, title="Number of steps between logging events") + 
eval_strategy: str = Field("epoch", title="Strategy for evaluation (e.g., 'epoch')") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to keep") + auto_find_batch_size: bool = Field(False, title="Whether to automatically find the optimal batch size") + mixed_precision: Optional[str] = Field( + None, title="Type of mixed precision to use (e.g., 'fp16', 'bf16', or None)" + ) + lr: float = Field(3e-5, title="Learning rate for training") epochs: int = Field(1, title="Number of training epochs") - batch_size: int = Field(2, title="Training batch size") - warmup_ratio: float = Field(0.1, title="Warmup proportion") - gradient_accumulation: int = Field(4, title="Gradient accumulation steps") - optimizer: str = Field("adamw_torch", title="Optimizer") - scheduler: str = Field("linear", title="Scheduler") - weight_decay: float = Field(0.0, title="Weight decay") - max_grad_norm: float = Field(1.0, title="Max gradient norm") - seed: int = Field(42, title="Seed") - chat_template: Optional[str] = Field(None, title="Chat template, one of: None, zephyr, chatml or tokenizer") + batch_size: int = Field(2, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Proportion of training to perform learning rate warmup") + gradient_accumulation: int = Field(4, title="Number of steps to accumulate gradients before updating") + optimizer: str = Field("adamw_torch", title="Optimizer to use for training") + scheduler: str = Field("linear", title="Learning rate scheduler to use") + weight_decay: float = Field(0.0, title="Weight decay to apply to the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum norm for gradient clipping") + seed: int = Field(42, title="Random seed for reproducibility") + chat_template: Optional[str] = Field( + None, title="Template for chat-based models, options include: None, zephyr, chatml, or tokenizer" + ) # peft - quantization: Optional[str] = Field("int4", title="int4, int8, or None") - target_modules: Optional[str] = Field("all-linear", title="Target modules") - merge_adapter: bool = Field(False, title="Merge adapter") - peft: bool = Field(False, title="Use PEFT") - lora_r: int = Field(16, title="Lora r") - lora_alpha: int = Field(32, title="Lora alpha") - lora_dropout: float = Field(0.05, title="Lora dropout") + quantization: Optional[str] = Field("int4", title="Quantization method to use (e.g., 'int4', 'int8', or None)") + target_modules: Optional[str] = Field("all-linear", title="Target modules for quantization or fine-tuning") + merge_adapter: bool = Field(False, title="Whether to merge the adapter layers") + peft: bool = Field(False, title="Whether to use Parameter-Efficient Fine-Tuning (PEFT)") + lora_r: int = Field(16, title="Rank of the LoRA matrices") + lora_alpha: int = Field(32, title="Alpha parameter for LoRA") + lora_dropout: float = Field(0.05, title="Dropout rate for LoRA") # dpo - model_ref: Optional[str] = Field(None, title="Reference, for DPO trainer") - dpo_beta: float = Field(0.1, title="Beta for DPO trainer") + model_ref: Optional[str] = Field(None, title="Reference model for DPO trainer") + dpo_beta: float = Field(0.1, title="Beta parameter for DPO trainer") # orpo + dpo - max_prompt_length: int = Field(128, title="Prompt length") - max_completion_length: Optional[int] = Field(None, title="Completion length") + max_prompt_length: int = Field(128, title="Maximum length of the prompt") + max_completion_length: Optional[int] = Field(None, title="Maximum length of the completion") # column mappings - 
prompt_text_column: Optional[str] = Field(None, title="Prompt text column") - text_column: str = Field("text", title="Text column") - rejected_text_column: Optional[str] = Field(None, title="Rejected text column") + prompt_text_column: Optional[str] = Field(None, title="Column name for the prompt text") + text_column: str = Field("text", title="Column name for the text data") + rejected_text_column: Optional[str] = Field(None, title="Column name for the rejected text data") # push to hub - push_to_hub: bool = Field(False, title="Push to hub") - username: Optional[str] = Field(None, title="Hugging Face Username") - token: Optional[str] = Field(None, title="Huggingface token") + push_to_hub: bool = Field(False, title="Whether to push the model to the Hugging Face Hub") + username: Optional[str] = Field(None, title="Hugging Face username for authentication") + token: Optional[str] = Field(None, title="Hugging Face token for authentication") # unsloth - unsloth: bool = Field(False, title="Use unsloth") - distributed_backend: Optional[str] = Field(None, title="Distributed backend") + unsloth: bool = Field(False, title="Whether to use the unsloth library") + distributed_backend: Optional[str] = Field(None, title="Backend to use for distributed training") diff --git a/src/autotrain/trainers/clm/utils.py b/src/autotrain/trainers/clm/utils.py index 4a49da4da3..78ef1df021 100644 --- a/src/autotrain/trainers/clm/utils.py +++ b/src/autotrain/trainers/clm/utils.py @@ -115,6 +115,20 @@ def list(cls): def preprocess_reward(examples, tokenizer): + """ + Preprocesses the reward data by tokenizing the chosen and rejected examples. + + Args: + examples (dict): A dictionary containing two keys, "chosen" and "rejected", each mapping to a list of text examples. + tokenizer (PreTrainedTokenizer): A tokenizer instance from the Hugging Face library used to tokenize the text examples. + + Returns: + dict: A dictionary with the following keys: + - "input_ids_chosen": List of tokenized input IDs for the chosen examples. + - "attention_mask_chosen": List of attention masks for the chosen examples. + - "input_ids_rejected": List of tokenized input IDs for the rejected examples. + - "attention_mask_rejected": List of attention masks for the rejected examples. + """ new_examples = { "input_ids_chosen": [], "attention_mask_chosen": [], @@ -134,6 +148,20 @@ def preprocess_reward(examples, tokenizer): def get_target_modules(config): + """ + Determines the target modules based on the provided configuration. + + Args: + config (object): Configuration object that contains the following attributes: + - target_modules (str or None): Specifies the target modules. It can be: + - None: Returns the default target modules for the model specified in the config. + - An empty string: Returns the default target modules for the model specified in the config. + - "all-linear": Returns the string "all-linear". + - A comma-separated string: Returns a list of target modules split by commas. + + Returns: + list or str: A list of target modules or a specific string ("all-linear") based on the configuration. + """ if config.target_modules is None: return TARGET_MODULES.get(config.model) if config.target_modules.strip() == "": @@ -144,6 +172,17 @@ def get_target_modules(config): def group_texts(examples, config): + """ + Groups texts into chunks of a specified block size. + + Args: + examples (dict): A dictionary where keys are feature names and values are lists of lists containing text data. 
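# Standalone sketch of the dispatch described in the get_target_modules docstring above;
# TARGET_MODULES here is a stand-in for the module's real per-model lookup table.
TARGET_MODULES = {"example/model": ["q_proj", "v_proj"]}  # hypothetical mapping

def get_target_modules_sketch(model_name, target_modules):
    if target_modules is None:
        return TARGET_MODULES.get(model_name)      # fall back to per-model defaults
    if target_modules.strip() == "":
        return TARGET_MODULES.get(model_name)      # empty string behaves like None
    if target_modules.strip().lower() == "all-linear":
        return "all-linear"                        # passed through verbatim
    return target_modules.split(",")               # comma-separated list of module names

print(get_target_modules_sketch("example/model", "q_proj,k_proj,v_proj"))
# ['q_proj', 'k_proj', 'v_proj']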
+ config (object): A configuration object that contains the block_size attribute. + + Returns: + dict: A dictionary with the same keys as the input examples, where each value is a list of chunks of text data. + Additionally, a "labels" key is added with the same value as the "input_ids" key. + """ # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) @@ -163,11 +202,33 @@ def group_texts(examples, config): def tokenize(examples, tokenizer, config): + """ + Tokenizes the input examples using the provided tokenizer and configuration. + + Args: + examples (dict): A dictionary containing the input examples to be tokenized. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the examples. + config (object): Configuration object that contains the text column name. + + Returns: + dict: A dictionary containing the tokenized output. + """ output = tokenizer(examples[config.text_column]) return output def merge_adapter(base_model_path, target_model_path, adapter_path): + """ + Merges an adapter into a base model and saves the resulting model and tokenizer. + + Args: + base_model_path (str): Path to the base model directory. + target_model_path (str): Path to the directory where the merged model and tokenizer will be saved. + adapter_path (str): Path to the adapter model directory. + + Raises: + RuntimeError: If resizing token embeddings fails without padding to a multiple of 8. + """ logger.info("Loading adapter...") model = AutoModelForCausalLM.from_pretrained( base_model_path, @@ -194,6 +255,19 @@ def merge_adapter(base_model_path, target_model_path, adapter_path): def create_model_card(config): + """ + Generates a model card string based on the provided configuration. + + Args: + config (object): Configuration object with the following attributes: + - peft (bool): Indicates if PEFT (Parameter-Efficient Fine-Tuning) is used. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + + Returns: + str: A formatted model card string. + """ if config.peft: peft = "\n- peft" else: @@ -218,6 +292,20 @@ def create_model_card(config): def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. + + Args: + params (object): An object containing the necessary parameters, including: + - token (str): The authorization token to access the Hugging Face API. + + Returns: + dict: The JSON response from the API call. + + Raises: + KeyError: If the "ENDPOINT_ID" environment variable is not set. + requests.exceptions.RequestException: If there is an issue with the API request. + """ endpoint_id = os.environ["ENDPOINT_ID"] username = endpoint_id.split("/")[0] project_name = endpoint_id.split("/")[1] @@ -232,6 +320,23 @@ def apply_chat_template( tokenizer, config, ): + """ + Applies a chat template to the given example based on the specified configuration. + + Args: + example (dict): The input example containing the text data to be processed. + tokenizer (object): The tokenizer to be used for applying the chat template. + config (object): Configuration object containing the following attributes: + - trainer (str): Specifies the type of trainer. Can be "default", "sft", "reward", "dpo", or "orpo". + - text_column (str): The key in the example dict that contains the text data. + - chat_template (str): Specifies the chat template to be used. 
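# Pure-Python sketch of the chunking behaviour described for group_texts: concatenate
# every feature, drop the remainder that does not fill a full block, cut the result
# into block_size pieces, and mirror input_ids into labels.
from itertools import chain

def group_texts_sketch(examples, block_size):
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

chunks = group_texts_sketch({"input_ids": [[1, 2, 3], [4, 5, 6, 7]]}, block_size=2)
print(chunks["input_ids"])  # [[1, 2], [3, 4], [5, 6]]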
Relevant for "reward" and "dpo" trainers. + + Returns: + dict: The modified example with the chat template applied. + + Raises: + ValueError: If the required keys are not found in the example for "reward", "dpo", or "orpo" trainers. + """ # kudos to Hugging Face H4 Team for this snippet if config.trainer in ("default", "sft"): messages = example[config.text_column] @@ -285,6 +390,29 @@ def apply_chat_template( def post_training_steps(config, trainer): + """ + Perform post-training steps including saving the model, creating a model card, merging adapter weights, + and optionally pushing the model to the Hugging Face Hub. + + Args: + config (object): Configuration object containing various settings and parameters. + trainer (object): Trainer object used for training the model. + + Steps: + 1. Save the trained model and set `use_cache` to True. + 2. Create a model card and save it as README.md in the output directory. + 3. If PEFT (Parameter-Efficient Fine-Tuning) and adapter merging are enabled: + - Delete the trainer object and clear CUDA cache. + - Merge adapter weights into the base model. + - Remove adapter weight files from the output directory. + 4. If pushing to the Hugging Face Hub is enabled: + - Remove training data folder. + - Push the model to the Hugging Face Hub repository. + 5. Pause the space if the process index is 0. + + Raises: + Exception: If merging adapter weights fails. + """ logger.info("Finished training, saving model...") trainer.model.config.use_cache = True trainer.save_model(config.project_name) @@ -335,6 +463,26 @@ def post_training_steps(config, trainer): def process_input_data(config): + """ + Processes input data based on the provided configuration. + + Args: + config (object): Configuration object containing the following attributes: + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - train_split (str): Split name for training data. + - valid_split (str, optional): Split name for validation data. + - token (str, optional): Token for accessing the dataset. + - text_column (str): Name of the text column. + - rejected_text_column (str): Name of the rejected text column. + - prompt_text_column (str): Name of the prompt text column. + - trainer (str): Type of trainer (e.g., "dpo", "reward", "orpo"). + + Returns: + tuple: A tuple containing: + - train_data (Dataset): Processed training dataset. + - valid_data (Dataset or None): Processed validation dataset if valid_split is provided, otherwise None. + """ if config.data_path == f"{config.project_name}/autotrain-data": logger.info("loading dataset from disk") train_data = load_from_disk(config.data_path)[config.train_split] @@ -402,6 +550,20 @@ def process_input_data(config): def get_tokenizer(config): + """ + Initializes and returns a tokenizer based on the provided configuration. + + Args: + config (object): Configuration object containing the following attributes: + - chat_template (str): The chat template type, either "chatml" or "zephyr". + - model (str): The model identifier to load the tokenizer from. + - token (str): The token to use for the tokenizer. + - model_max_length (int): The maximum length of the model. + - padding (str): The padding side, either "left" or "right". + + Returns: + tokenizer (PreTrainedTokenizer): The initialized tokenizer with the specified configuration. 
+ """ special_tokens = None chat_template = None if config.chat_template == "chatml": @@ -445,6 +607,24 @@ def get_tokenizer(config): def process_data_with_chat_template(config, tokenizer, train_data, valid_data): + """ + Processes training and validation data using a specified chat template. + + Args: + config (object): Configuration object containing settings and parameters. + tokenizer (object): Tokenizer object used for tokenizing the data. + train_data (Dataset): Training dataset to be processed. + valid_data (Dataset): Validation dataset to be processed. + + Returns: + tuple: A tuple containing the processed training and validation datasets. + + Notes: + - If `config.chat_template` is one of ("chatml", "zephyr", "tokenizer"), the chat template will be applied. + - Logs information about the application of the chat template. + - For ORPO/DPO, the `prompt` will be extracted from chosen messages. + - If `config.valid_split` is not None, the validation data will also be processed. + """ valid_data = None if config.chat_template in ("chatml", "zephyr", "tokenizer"): logger.info("Applying chat template") @@ -468,6 +648,22 @@ def process_data_with_chat_template(config, tokenizer, train_data, valid_data): def configure_logging_steps(config, train_data, valid_data): + """ + Configures the logging steps for training based on the provided configuration and data. + + Parameters: + config (object): Configuration object containing training parameters, including `logging_steps`, `valid_split`, and `batch_size`. + train_data (iterable): Training dataset. + valid_data (iterable): Validation dataset. + + Returns: + int: The number of logging steps to be used during training. + + Notes: + - If `config.logging_steps` is set to -1, the function calculates logging steps based on 20% of the length of the validation data (if `valid_split` is provided) or the training data. + - The calculated logging steps are constrained to be between 1 and 25. + - If `config.logging_steps` is not -1, the function uses the provided value. + """ logger.info("configuring logging steps") if config.logging_steps == -1: if config.valid_split is not None: @@ -486,6 +682,40 @@ def configure_logging_steps(config, train_data, valid_data): def configure_training_args(config, logging_steps): + """ + Configures the training arguments for a language model based on the provided configuration. + + Args: + config (object): Configuration object containing various training parameters. + logging_steps (int): Number of steps between logging events. + + Returns: + dict: A dictionary containing the configured training arguments. + + The configuration object `config` should have the following attributes: + - project_name (str): The name of the project, used as the output directory. + - batch_size (int): Batch size for both training and evaluation. + - lr (float): Learning rate. + - epochs (int): Number of training epochs. + - eval_strategy (str): Evaluation strategy, e.g., "steps" or "epoch". + - valid_split (float or None): Validation split ratio. If None, evaluation is disabled. + - save_total_limit (int): Maximum number of checkpoints to save. + - gradient_accumulation (int): Number of gradient accumulation steps. + - log (str): Logging destination, e.g., "tensorboard". + - auto_find_batch_size (bool): Whether to automatically find the optimal batch size. + - scheduler (str): Learning rate scheduler type. + - optimizer (str): Optimizer type. + - warmup_ratio (float): Warmup ratio for learning rate scheduling. 
+ - weight_decay (float): Weight decay for the optimizer. + - max_grad_norm (float): Maximum gradient norm for clipping. + - disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing. + - peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). + - quantization (str): Quantization type, e.g., "int4" or "int8". + - mixed_precision (str): Mixed precision type, e.g., "fp16" or "bf16". + + The function also sets additional training arguments based on the provided configuration, + such as enabling gradient checkpointing and mixed precision training. + """ logger.info("configuring training args") training_args = dict( output_dir=config.project_name, @@ -527,6 +757,21 @@ def configure_training_args(config, logging_steps): def configure_block_size(config, tokenizer): + """ + Configures the block size for the given configuration and tokenizer. + + This function sets the `block_size` attribute in the `config` object based on the `tokenizer`'s maximum model length. + If `config.block_size` is -1, it is set to None. If `config.block_size` is None, it defaults to the tokenizer's + `model_max_length` but not exceeding 1024. If `config.block_size` is specified and exceeds the tokenizer's + `model_max_length`, a warning is logged and the block size is set to the tokenizer's `model_max_length`. + + Args: + config (object): Configuration object that contains the `block_size` attribute. + tokenizer (object): Tokenizer object that contains the `model_max_length` attribute. + + Returns: + object: The updated configuration object with the `block_size` attribute set. + """ if config.block_size == -1: config.block_size = None @@ -554,6 +799,19 @@ def configure_block_size(config, tokenizer): def get_callbacks(config): + """ + Generate a list of callback instances based on the provided configuration. + + This function creates a list of callback instances that are used during the training process. + It includes default callbacks for logging and training start, and conditionally adds callbacks + for saving and loading PEFT models based on the configuration and environment settings. + + Args: + config (object): Configuration object containing training settings and parameters. + + Returns: + list: A list of callback instances to be used during training. + """ is_deepspeed_enabled = os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true" callbacks = [UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()] if config.peft and not is_deepspeed_enabled: @@ -564,6 +822,34 @@ def get_callbacks(config): def get_model(config, tokenizer): + """ + Loads and configures a language model based on the provided configuration and tokenizer. + + Args: + config (Namespace): Configuration object containing model parameters and settings. + - model (str): The model name or path. + - token (str): Token for accessing the model. + - unsloth (bool): Flag to determine if unsloth is used. + - trainer (str): Type of trainer to use. + - target_modules (str): Target modules for unsloth. + - peft (bool): Flag to determine if PEFT (Parameter-Efficient Fine-Tuning) is used. + - quantization (str): Quantization type, either "int4" or "int8". + - mixed_precision (str): Mixed precision type, either "fp16" or "bf16". + - block_size (int): Maximum sequence length. + - lora_r (int): LoRA rank. + - lora_alpha (int): LoRA alpha. + - lora_dropout (float): LoRA dropout rate. + - seed (int): Random seed. + - disable_gradient_checkpointing (bool): Flag to disable gradient checkpointing. 
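# Sketch of the block-size rules described for configure_block_size: -1 means "unset",
# unset falls back to the tokenizer's model_max_length capped at 1024, and an oversized
# value is clamped to the tokenizer's limit.
def configure_block_size_sketch(block_size, tokenizer_model_max_length):
    if block_size == -1:
        block_size = None
    if block_size is None:
        return min(tokenizer_model_max_length, 1024)
    if block_size > tokenizer_model_max_length:
        # the real implementation logs a warning before clamping
        return tokenizer_model_max_length
    return block_size

print(configure_block_size_sketch(-1, 2048))    # 1024
print(configure_block_size_sketch(4096, 2048))  # 2048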
+ - use_flash_attention_2 (bool): Flag to use flash attention 2. + tokenizer (PreTrainedTokenizer): Tokenizer to use with the model. + + Returns: + PreTrainedModel: The configured language model. + + Raises: + ImportError: If unsloth is not available when required. + """ model_config = AutoConfig.from_pretrained( config.model, token=config.token, diff --git a/src/autotrain/trainers/common.py b/src/autotrain/trainers/common.py index 73e4fc1df5..1db188015f 100644 --- a/src/autotrain/trainers/common.py +++ b/src/autotrain/trainers/common.py @@ -21,6 +21,15 @@ def get_file_sizes(directory): + """ + Calculate the sizes of all files in a given directory and its subdirectories. + + Args: + directory (str): The path to the directory to scan for files. + + Returns: + dict: A dictionary where the keys are the file paths and the values are the file sizes in gigabytes (GB). + """ file_sizes = {} for root, _, files in os.walk(directory): for file in files: @@ -32,6 +41,19 @@ def get_file_sizes(directory): def remove_global_step(directory): + """ + Removes directories that start with 'global_step' within the specified directory. + + This function traverses the given directory and its subdirectories in a bottom-up manner. + If it finds any directory whose name starts with 'global_step', it deletes that directory + and all its contents. + + Args: + directory (str): The path to the directory to be traversed and cleaned. + + Returns: + None + """ for root, dirs, _ in os.walk(directory, topdown=False): for name in dirs: if name.startswith("global_step"): @@ -41,11 +63,30 @@ def remove_global_step(directory): def remove_autotrain_data(config): + """ + Removes the AutoTrain data directory and global step for a given project. + + Args: + config (object): Configuration object that contains the project name. + + Raises: + OSError: If the removal of the directory fails. + """ os.system(f"rm -rf {config.project_name}/autotrain-data") remove_global_step(config.project_name) def save_training_params(config): + """ + Saves the training parameters to a JSON file, excluding the "token" key if it exists. + + Args: + config (object): Configuration object that contains the project name. + + The function checks if a file named 'training_params.json' exists in the directory + specified by `config.project_name`. If the file exists, it loads the JSON content, + removes the "token" key if present, and then writes the updated content back to the file. + """ if os.path.exists(f"{config.project_name}/training_params.json"): training_params = json.load(open(f"{config.project_name}/training_params.json")) if "token" in training_params: @@ -58,6 +99,24 @@ def save_training_params(config): def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. + + Args: + params (dict or object): Parameters containing the token required for authorization. + If a dictionary is provided, it should have a key "token" with the authorization token. + If an object is provided, it should have an attribute `token` with the authorization token. + + Returns: + dict: The JSON response from the API call to pause the endpoint. + + Raises: + KeyError: If the "token" key is missing in the params dictionary. + requests.exceptions.RequestException: If there is an issue with the API request. + + Environment Variables: + ENDPOINT_ID: Should be set to the endpoint identifier in the format "username/project_name". 
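# Standalone sketch of the token-scrubbing behaviour save_training_params describes:
# re-read training_params.json from the project directory and persist it without the
# "token" secret.
import json
import os

def save_training_params_sketch(project_name):
    path = os.path.join(project_name, "training_params.json")
    if os.path.exists(path):
        with open(path) as f:
            training_params = json.load(f)
        training_params.pop("token", None)
        with open(path, "w") as f:
            json.dump(training_params, f, indent=4)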
+ """ if isinstance(params, dict): token = params["token"] else: @@ -72,6 +131,26 @@ def pause_endpoint(params): def pause_space(params, is_failure=False): + """ + Pauses the Hugging Face space and optionally shuts down the endpoint. + + This function checks for the presence of "SPACE_ID" and "ENDPOINT_ID" in the environment variables. + If "SPACE_ID" is found, it pauses the space and creates a discussion on the Hugging Face platform + to notify the user about the status of the training run (success or failure). + If "ENDPOINT_ID" is found, it pauses the endpoint. + + Args: + params (object): An object containing the necessary parameters, including the token, username, and project name. + is_failure (bool, optional): A flag indicating whether the training run failed. Defaults to False. + + Raises: + Exception: If there is an error while creating the discussion on the Hugging Face platform. + + Logs: + Info: Logs the status of pausing the space and endpoint. + Warning: Logs any issues encountered while creating the discussion. + Error: Logs if the model failed to train and the discussion was not created. + """ if "SPACE_ID" in os.environ: # shut down the space logger.info("Pausing space...") @@ -108,6 +187,22 @@ def pause_space(params, is_failure=False): def monitor(func): + """ + A decorator that wraps a function to monitor its execution and handle exceptions. + + This decorator performs the following actions: + 1. Retrieves the 'config' parameter from the function's keyword arguments or positional arguments. + 2. Executes the wrapped function. + 3. If an exception occurs during the execution of the wrapped function, logs the error message and stack trace. + 4. Optionally pauses the execution if the environment variable 'PAUSE_ON_FAILURE' is set to 1. + + Args: + func (callable): The function to be wrapped by the decorator. + + Returns: + callable: The wrapped function with monitoring capabilities. + """ + def wrapper(*args, **kwargs): config = kwargs.get("config", None) if config is None and len(args) > 0: @@ -127,7 +222,18 @@ def wrapper(*args, **kwargs): class AutoTrainParams(BaseModel): """ - Base class for all AutoTrain parameters. + AutoTrainParams is a base class for all AutoTrain parameters. + Attributes: + Config (class): Configuration class for Pydantic model. + protected_namespaces (tuple): Protected namespaces for the model. + Methods: + save(output_dir): + Save parameters to a JSON file in the specified output directory. + __str__(): + Return a string representation of the parameters, masking the token if present. + __init__(**data): + Initialize the parameters, check for unused/extra parameters, and warn the user if necessary. + Raises ValueError if project_name is not alphanumeric (with hyphens allowed) or exceeds 50 characters. """ class Config: @@ -181,6 +287,22 @@ def __init__(self, **data): class UploadLogs(TrainerCallback): + """ + A callback to upload training logs to the Hugging Face Hub. + + Args: + config (object): Configuration object containing necessary parameters. + + Attributes: + config (object): Configuration object containing necessary parameters. + api (HfApi or None): Instance of HfApi for interacting with the Hugging Face Hub. + last_upload_time (float): Timestamp of the last upload. + + Methods: + on_step_end(args, state, control, **kwargs): + Called at the end of each training step. Uploads logs to the Hugging Face Hub if conditions are met. 
+ """ + def __init__(self, config): self.config = config self.api = None @@ -219,6 +341,26 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra class LossLoggingCallback(TrainerCallback): + """ + LossLoggingCallback is a custom callback for logging loss during training. + + This callback inherits from `TrainerCallback` and overrides the `on_log` method + to remove the "total_flos" key from the logs and log the remaining information + if the current process is the local process zero. + + Methods: + on_log(args, state, control, logs=None, **kwargs): + Called when the logs are updated. Removes the "total_flos" key from the logs + and logs the remaining information if the current process is the local process zero. + + Args: + args: The training arguments. + state: The current state of the Trainer. + control: The control object for the Trainer. + logs (dict, optional): The logs dictionary containing the training metrics. + **kwargs: Additional keyword arguments. + """ + def on_log(self, args, state, control, logs=None, **kwargs): _ = logs.pop("total_flos", None) if state.is_local_process_zero: @@ -226,5 +368,19 @@ def on_log(self, args, state, control, logs=None, **kwargs): class TrainStartCallback(TrainerCallback): + """ + TrainStartCallback is a custom callback for the Trainer class that logs a message when training begins. + + Methods: + on_train_begin(args, state, control, **kwargs): + Logs a message indicating that training is starting. + + Args: + args: The training arguments. + state: The current state of the Trainer. + control: The control object for the Trainer. + **kwargs: Additional keyword arguments. + """ + def on_train_begin(self, args, state, control, **kwargs): logger.info("Starting to train...") diff --git a/src/autotrain/trainers/dreambooth/params.py b/src/autotrain/trainers/dreambooth/params.py index ff15a9bd96..6109b85976 100644 --- a/src/autotrain/trainers/dreambooth/params.py +++ b/src/autotrain/trainers/dreambooth/params.py @@ -6,72 +6,135 @@ class DreamBoothTrainingParams(AutoTrainParams): - model: str = Field(None, title="Model name") - vae_model: Optional[str] = Field(None, title="VAE model name") - revision: Optional[str] = Field(None, title="Revision") - tokenizer: Optional[str] = Field(None, title="Tokenizer, if different from model") - image_path: str = Field(None, title="Image path") - class_image_path: Optional[str] = Field(None, title="Class image path") - prompt: str = Field(None, title="Instance prompt") - class_prompt: Optional[str] = Field(None, title="Class prompt") - num_class_images: int = Field(100, title="Number of class images") - class_labels_conditioning: Optional[str] = Field(None, title="Class labels conditioning") - - prior_preservation: bool = Field(False, title="With prior preservation") - prior_loss_weight: float = Field(1.0, title="Prior loss weight") - - project_name: str = Field("dreambooth-model", title="Output directory") - seed: int = Field(42, title="Seed") - resolution: int = Field(512, title="Resolution") - center_crop: bool = Field(False, title="Center crop") - train_text_encoder: bool = Field(False, title="Train text encoder") - batch_size: int = Field(4, title="Train batch size") - sample_batch_size: int = Field(4, title="Sample batch size") + """ + DreamBoothTrainingParams + + Attributes: + model (str): Name of the model to be used for training. + vae_model (Optional[str]): Name of the VAE model to be used, if any. + revision (Optional[str]): Specific model version to use. 
+ tokenizer (Optional[str]): Tokenizer to be used, if different from the model. + image_path (str): Path to the training images. + class_image_path (Optional[str]): Path to the class images. + prompt (str): Prompt for the instance images. + class_prompt (Optional[str]): Prompt for the class images. + num_class_images (int): Number of class images to generate. + class_labels_conditioning (Optional[str]): Conditioning labels for class images. + prior_preservation (bool): Enable prior preservation during training. + prior_loss_weight (float): Weight of the prior preservation loss. + project_name (str): Name of the project for output directory. + seed (int): Random seed for reproducibility. + resolution (int): Resolution of the training images. + center_crop (bool): Enable center cropping of images. + train_text_encoder (bool): Enable training of the text encoder. + batch_size (int): Batch size for training. + sample_batch_size (int): Batch size for sampling. + epochs (int): Number of training epochs. + num_steps (int): Maximum number of training steps. + checkpointing_steps (int): Steps interval for checkpointing. + resume_from_checkpoint (Optional[str]): Path to resume training from a checkpoint. + gradient_accumulation (int): Number of gradient accumulation steps. + disable_gradient_checkpointing (bool): Disable gradient checkpointing. + lr (float): Learning rate for training. + scale_lr (bool): Enable scaling of the learning rate. + scheduler (str): Type of learning rate scheduler. + warmup_steps (int): Number of warmup steps for learning rate scheduler. + num_cycles (int): Number of cycles for learning rate scheduler. + lr_power (float): Power factor for learning rate scheduler. + dataloader_num_workers (int): Number of workers for data loading. + use_8bit_adam (bool): Enable use of 8-bit Adam optimizer. + adam_beta1 (float): Beta1 parameter for Adam optimizer. + adam_beta2 (float): Beta2 parameter for Adam optimizer. + adam_weight_decay (float): Weight decay for Adam optimizer. + adam_epsilon (float): Epsilon parameter for Adam optimizer. + max_grad_norm (float): Maximum gradient norm for clipping. + allow_tf32 (bool): Allow use of TF32 for training. + prior_generation_precision (Optional[str]): Precision for prior generation. + local_rank (int): Local rank for distributed training. + xformers (bool): Enable xformers memory efficient attention. + pre_compute_text_embeddings (bool): Pre-compute text embeddings before training. + tokenizer_max_length (Optional[int]): Maximum length for tokenizer. + text_encoder_use_attention_mask (bool): Use attention mask for text encoder. + rank (int): Rank for distributed training. + xl (bool): Enable XL model training. + mixed_precision (Optional[str]): Enable mixed precision training. + token (Optional[str]): Token for accessing the model hub. + push_to_hub (bool): Enable pushing the model to the hub. + username (Optional[str]): Username for the model hub. + validation_prompt (Optional[str]): Prompt for validation images. + num_validation_images (int): Number of validation images to generate. + validation_epochs (int): Epoch interval for validation. + checkpoints_total_limit (Optional[int]): Total limit for checkpoints. + validation_images (Optional[str]): Path to validation images. + logging (bool): Enable logging using TensorBoard. 
+ """ + + model: str = Field(None, title="Name of the model to be used for training") + vae_model: Optional[str] = Field(None, title="Name of the VAE model to be used, if any") + revision: Optional[str] = Field(None, title="Specific model version to use") + tokenizer: Optional[str] = Field(None, title="Tokenizer to be used, if different from the model") + image_path: str = Field(None, title="Path to the training images") + class_image_path: Optional[str] = Field(None, title="Path to the class images") + prompt: str = Field(None, title="Prompt for the instance images") + class_prompt: Optional[str] = Field(None, title="Prompt for the class images") + num_class_images: int = Field(100, title="Number of class images to generate") + class_labels_conditioning: Optional[str] = Field(None, title="Conditioning labels for class images") + + prior_preservation: bool = Field(False, title="Enable prior preservation during training") + prior_loss_weight: float = Field(1.0, title="Weight of the prior preservation loss") + + project_name: str = Field("dreambooth-model", title="Name of the project for output directory") + seed: int = Field(42, title="Random seed for reproducibility") + resolution: int = Field(512, title="Resolution of the training images") + center_crop: bool = Field(False, title="Enable center cropping of images") + train_text_encoder: bool = Field(False, title="Enable training of the text encoder") + batch_size: int = Field(4, title="Batch size for training") + sample_batch_size: int = Field(4, title="Batch size for sampling") epochs: int = Field(1, title="Number of training epochs") - num_steps: int = Field(None, title="Max train steps") - checkpointing_steps: int = Field(500, title="Checkpointing steps") - resume_from_checkpoint: Optional[str] = Field(None, title="Resume from checkpoint") - - gradient_accumulation: int = Field(1, title="Gradient accumulation steps") - disable_gradient_checkpointing: bool = Field(False, title="Gradient checkpointing") - - lr: float = Field(1e-4, title="Learning rate") - scale_lr: bool = Field(False, title="Scale learning rate") - scheduler: str = Field("constant", title="Learning rate scheduler") - warmup_steps: int = Field(0, title="Learning rate warmup steps") - num_cycles: int = Field(1, title="Learning rate num cycles") - lr_power: float = Field(1.0, title="Learning rate power") - - dataloader_num_workers: int = Field(0, title="Dataloader num workers") - use_8bit_adam: bool = Field(False, title="Use 8bit adam") - adam_beta1: float = Field(0.9, title="Adam beta 1") - adam_beta2: float = Field(0.999, title="Adam beta 2") - adam_weight_decay: float = Field(1e-2, title="Adam weight decay") - adam_epsilon: float = Field(1e-8, title="Adam epsilon") - max_grad_norm: float = Field(1.0, title="Max grad norm") - - allow_tf32: bool = Field(False, title="Allow TF32") - prior_generation_precision: Optional[str] = Field(None, title="Prior generation precision") - local_rank: int = Field(-1, title="Local rank") + num_steps: int = Field(None, title="Maximum number of training steps") + checkpointing_steps: int = Field(500, title="Steps interval for checkpointing") + resume_from_checkpoint: Optional[str] = Field(None, title="Path to resume training from a checkpoint") + + gradient_accumulation: int = Field(1, title="Number of gradient accumulation steps") + disable_gradient_checkpointing: bool = Field(False, title="Disable gradient checkpointing") + + lr: float = Field(1e-4, title="Learning rate for training") + scale_lr: bool = Field(False, title="Enable scaling 
of the learning rate") + scheduler: str = Field("constant", title="Type of learning rate scheduler") + warmup_steps: int = Field(0, title="Number of warmup steps for learning rate scheduler") + num_cycles: int = Field(1, title="Number of cycles for learning rate scheduler") + lr_power: float = Field(1.0, title="Power factor for learning rate scheduler") + + dataloader_num_workers: int = Field(0, title="Number of workers for data loading") + use_8bit_adam: bool = Field(False, title="Enable use of 8-bit Adam optimizer") + adam_beta1: float = Field(0.9, title="Beta1 parameter for Adam optimizer") + adam_beta2: float = Field(0.999, title="Beta2 parameter for Adam optimizer") + adam_weight_decay: float = Field(1e-2, title="Weight decay for Adam optimizer") + adam_epsilon: float = Field(1e-8, title="Epsilon parameter for Adam optimizer") + max_grad_norm: float = Field(1.0, title="Maximum gradient norm for clipping") + + allow_tf32: bool = Field(False, title="Allow use of TF32 for training") + prior_generation_precision: Optional[str] = Field(None, title="Precision for prior generation") + local_rank: int = Field(-1, title="Local rank for distributed training") xformers: bool = Field(False, title="Enable xformers memory efficient attention") - pre_compute_text_embeddings: bool = Field(False, title="Pre compute text embeddings") - tokenizer_max_length: Optional[int] = Field(None, title="Tokenizer max length") - text_encoder_use_attention_mask: bool = Field(False, title="Text encoder use attention mask") + pre_compute_text_embeddings: bool = Field(False, title="Pre-compute text embeddings before training") + tokenizer_max_length: Optional[int] = Field(None, title="Maximum length for tokenizer") + text_encoder_use_attention_mask: bool = Field(False, title="Use attention mask for text encoder") - rank: int = Field(4, title="Rank") - xl: bool = Field(False, title="XL") + rank: int = Field(4, title="Rank for distributed training") + xl: bool = Field(False, title="Enable XL model training") - mixed_precision: Optional[str] = Field(None, title="Mixed precision") + mixed_precision: Optional[str] = Field(None, title="Enable mixed precision training") - token: Optional[str] = Field(None, title="Hub token") - push_to_hub: bool = Field(False, title="Push to hub") - username: Optional[str] = Field(None, title="Hub username") + token: Optional[str] = Field(None, title="Token for accessing the model hub") + push_to_hub: bool = Field(False, title="Enable pushing the model to the hub") + username: Optional[str] = Field(None, title="Username for the model hub") # disabled: - validation_prompt: Optional[str] = Field(None, title="Validation prompt") - num_validation_images: int = Field(4, title="Number of validation images") - validation_epochs: int = Field(50, title="Validation epochs") - checkpoints_total_limit: Optional[int] = Field(None, title="Checkpoints total limit") - validation_images: Optional[str] = Field(None, title="Validation images") + validation_prompt: Optional[str] = Field(None, title="Prompt for validation images") + num_validation_images: int = Field(4, title="Number of validation images to generate") + validation_epochs: int = Field(50, title="Epoch interval for validation") + checkpoints_total_limit: Optional[int] = Field(None, title="Total limit for checkpoints") + validation_images: Optional[str] = Field(None, title="Path to validation images") - logging: bool = Field(False, title="Logging using tensorboard") + logging: bool = Field(False, title="Enable logging using TensorBoard") diff --git 
a/src/autotrain/trainers/extractive_question_answering/dataset.py b/src/autotrain/trainers/extractive_question_answering/dataset.py index 70939b7cf9..c9429f6242 100644 --- a/src/autotrain/trainers/extractive_question_answering/dataset.py +++ b/src/autotrain/trainers/extractive_question_answering/dataset.py @@ -83,6 +83,25 @@ def _prepare_dataset(examples, tokenizer, config): class ExtractiveQuestionAnsweringDataset: + """ + A dataset class for extractive question answering tasks. + + Args: + data (Dataset): The dataset to be processed. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for processing the data. + config (dict): Configuration parameters for processing the dataset. + + Attributes: + data (Dataset): The original dataset. + tokenizer (PreTrainedTokenizer): The tokenizer used for processing the data. + config (dict): Configuration parameters for processing the dataset. + tokenized_data (Dataset): The tokenized dataset after applying the mapping function. + + Methods: + __len__(): Returns the length of the tokenized dataset. + __getitem__(item): Returns the tokenized data at the specified index. + """ + def __init__(self, data, tokenizer, config): self.data = data self.tokenizer = tokenizer diff --git a/src/autotrain/trainers/extractive_question_answering/params.py b/src/autotrain/trainers/extractive_question_answering/params.py index 62c01655f1..61e1f8a686 100644 --- a/src/autotrain/trainers/extractive_question_answering/params.py +++ b/src/autotrain/trainers/extractive_question_answering/params.py @@ -6,34 +6,71 @@ class ExtractiveQuestionAnsweringParams(AutoTrainParams): - data_path: str = Field(None, title="Data path") - model: str = Field("bert-base-uncased", title="Model name") - lr: float = Field(5e-5, title="Learning rate") + """ + ExtractiveQuestionAnsweringParams + + Parameters: + data_path (str): Path to the dataset. + model (str): Pre-trained model name. Default is "bert-base-uncased". + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for inputs. Default is 128. + max_doc_stride (int): Maximum document stride for splitting context. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Warmup proportion for learning rate scheduler. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer type. Default is "adamw_torch". + scheduler (str): Learning rate scheduler type. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. Default is None. + text_column (str): Column name for context/text. Default is "context". + question_column (str): Column name for questions. Default is "question". + answer_column (str): Column name for answers. Default is "answers". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Automatically find optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. 
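# A minimal record in the column layout the extractive-QA parameters above refer to
# (context/question/answers); the SQuAD-style answers dict shown here is an assumption
# about the expected schema, and the column names are configurable via text_column,
# question_column and answer_column.
example = {
    "context": "AutoTrain is a library for automatic model training.",
    "question": "What is AutoTrain?",
    "answers": {"text": ["a library for automatic model training"], "answer_start": [13]},
}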
+ save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Authentication token for Hugging Face Hub. Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy during training. Default is "epoch". + username (Optional[str]): Hugging Face username for authentication. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping improvement. Default is 0.01. + """ + + data_path: str = Field(None, title="Path to the dataset") + model: str = Field("bert-base-uncased", title="Pre-trained model name") + lr: float = Field(5e-5, title="Learning rate for the optimizer") epochs: int = Field(3, title="Number of training epochs") - max_seq_length: int = Field(128, title="Max sequence length") - max_doc_stride: int = Field(128, title="Max doc stride") - batch_size: int = Field(8, title="Training batch size") - warmup_ratio: float = Field(0.1, title="Warmup proportion") - gradient_accumulation: int = Field(1, title="Gradient accumulation steps") - optimizer: str = Field("adamw_torch", title="Optimizer") - scheduler: str = Field("linear", title="Scheduler") - weight_decay: float = Field(0.0, title="Weight decay") - max_grad_norm: float = Field(1.0, title="Max gradient norm") - seed: int = Field(42, title="Seed") - train_split: str = Field("train", title="Train split") - valid_split: Optional[str] = Field(None, title="Validation split") - text_column: str = Field("context", title="context/text column") - question_column: str = Field("question", title="question column") - answer_column: str = Field("answers", title="answer column") - logging_steps: int = Field(-1, title="Logging steps") - project_name: str = Field("project-name", title="Output directory") - auto_find_batch_size: bool = Field(False, title="Auto find batch size") - mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") - save_total_limit: int = Field(1, title="Save total limit") - token: Optional[str] = Field(None, title="Hub Token") - push_to_hub: bool = Field(False, title="Push to hub") - eval_strategy: str = Field("epoch", title="Evaluation strategy") - username: Optional[str] = Field(None, title="Hugging Face Username") - log: str = Field("none", title="Logging using experiment tracking") - early_stopping_patience: int = Field(5, title="Early stopping patience") - early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") + max_seq_length: int = Field(128, title="Maximum sequence length for inputs") + max_doc_stride: int = Field(128, title="Maximum document stride for splitting context") + batch_size: int = Field(8, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Warmup proportion for learning rate scheduler") + gradient_accumulation: int = Field(1, title="Number of gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer type") + scheduler: str = Field("linear", title="Learning rate scheduler type") + weight_decay: float = Field(0.0, title="Weight decay for the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum gradient norm for clipping") + seed: int = Field(42, title="Random seed for reproducibility") + train_split: str = Field("train", title="Name of the training data split") + valid_split: 
Optional[str] = Field(None, title="Name of the validation data split") + text_column: str = Field("context", title="Column name for context/text") + question_column: str = Field("question", title="Column name for questions") + answer_column: str = Field("answers", title="Column name for answers") + logging_steps: int = Field(-1, title="Number of steps between logging") + project_name: str = Field("project-name", title="Name of the project for output directory") + auto_find_batch_size: bool = Field(False, title="Automatically find optimal batch size") + mixed_precision: Optional[str] = Field(None, title="Mixed precision training mode (fp16, bf16, or None)") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to save") + token: Optional[str] = Field(None, title="Authentication token for Hugging Face Hub") + push_to_hub: bool = Field(False, title="Whether to push the model to Hugging Face Hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy during training") + username: Optional[str] = Field(None, title="Hugging Face username for authentication") + log: str = Field("none", title="Logging method for experiment tracking") + early_stopping_patience: int = Field(5, title="Number of epochs with no improvement for early stopping") + early_stopping_threshold: float = Field(0.01, title="Threshold for early stopping improvement") diff --git a/src/autotrain/trainers/generic/__main__.py b/src/autotrain/trainers/generic/__main__.py index bb3af9ecaf..2dc4c07b78 100644 --- a/src/autotrain/trainers/generic/__main__.py +++ b/src/autotrain/trainers/generic/__main__.py @@ -16,6 +16,20 @@ def parse_args(): @monitor def run(config): + """ + Executes a series of operations based on the provided configuration. + + This function performs the following steps: + 1. Converts the configuration dictionary to a GenericParams object if necessary. + 2. Downloads the data repository specified in the configuration. + 3. Uninstalls any existing requirements specified in the configuration. + 4. Installs the necessary requirements specified in the configuration. + 5. Runs a command specified in the configuration. + 6. Pauses the space as specified in the configuration. + + Args: + config (dict or GenericParams): The configuration for the operations to be performed. + """ if isinstance(config, dict): config = GenericParams(**config) diff --git a/src/autotrain/trainers/generic/params.py b/src/autotrain/trainers/generic/params.py index 4bf67479d1..8d826a4bec 100644 --- a/src/autotrain/trainers/generic/params.py +++ b/src/autotrain/trainers/generic/params.py @@ -6,10 +6,31 @@ class GenericParams(AutoTrainParams): - username: str = Field(None, title="Hugging Face Username") - project_name: str = Field("project-name", title="path to script.py") - data_path: str = Field(None, title="Data path") - token: str = Field(None, title="Hub Token") - script_path: str = Field(None, title="Script path") - env: Optional[Dict[str, str]] = Field(None, title="Environment Variables") - args: Optional[Dict[str, str]] = Field(None, title="Arguments") + """ + GenericParams is a class that holds configuration parameters for an AutoTrain SpaceRunner project. + + Attributes: + username (str): The username for your Hugging Face account. + project_name (str): The name of the project. + data_path (str): The file path to the dataset. + token (str): The authentication token for accessing Hugging Face Hub. + script_path (str): The file path to the script to be executed. Path to script.py. 
+ env (Optional[Dict[str, str]]): A dictionary of environment variables to be set. + args (Optional[Dict[str, str]]): A dictionary of arguments to be passed to the script. + """ + + username: str = Field( + None, title="Hugging Face Username", description="The username for your Hugging Face account." + ) + project_name: str = Field("project-name", title="Project Name", description="The name of the project.") + data_path: str = Field(None, title="Data Path", description="The file path to the dataset.") + token: str = Field(None, title="Hub Token", description="The authentication token for accessing Hugging Face Hub.") + script_path: str = Field( + None, title="Script Path", description="The file path to the script to be executed. Path to script.py" + ) + env: Optional[Dict[str, str]] = Field( + None, title="Environment Variables", description="A dictionary of environment variables to be set." + ) + args: Optional[Dict[str, str]] = Field( + None, title="Arguments", description="A dictionary of arguments to be passed to the script." + ) diff --git a/src/autotrain/trainers/generic/utils.py b/src/autotrain/trainers/generic/utils.py index 00255c934c..1290631432 100644 --- a/src/autotrain/trainers/generic/utils.py +++ b/src/autotrain/trainers/generic/utils.py @@ -8,6 +8,18 @@ def create_dataset_repo(username, project_name, script_path, token): + """ + Creates a new dataset repository on Hugging Face and uploads the specified dataset. + + Args: + username (str): The username of the Hugging Face account. + project_name (str): The name of the project for which the dataset repository is being created. + script_path (str): The local path to the dataset folder that needs to be uploaded. + token (str): The authentication token for the Hugging Face API. + + Returns: + str: The repository ID of the newly created dataset repository. + """ logger.info("Creating dataset repo...") api = HfApi(token=token) repo_id = f"{username}/autotrain-{project_name}" @@ -27,6 +39,18 @@ def create_dataset_repo(username, project_name, script_path, token): def pull_dataset_repo(params): + """ + Downloads a dataset repository from Hugging Face Hub. + + Args: + params (object): An object containing the following attributes: + - data_path (str): The repository ID of the dataset. + - project_name (str): The local directory where the dataset will be downloaded. + - token (str): The authentication token for accessing the repository. + + Returns: + None + """ snapshot_download( repo_id=params.data_path, local_dir=params.project_name, @@ -36,6 +60,20 @@ def pull_dataset_repo(params): def uninstall_requirements(params): + """ + Uninstalls the requirements specified in the requirements.txt file of a given project. + + This function reads the requirements.txt file located in the project's directory, + extracts the packages to be uninstalled, writes them to an uninstall.txt file, + and then uses pip to uninstall those packages. + + Args: + params (object): An object containing the project_name attribute, which specifies + the directory of the project. + + Returns: + None + """ if os.path.exists(f"{params.project_name}/requirements.txt"): # read the requirements.txt uninstall_list = [] @@ -65,6 +103,22 @@ def uninstall_requirements(params): def install_requirements(params): + """ + Installs the Python packages listed in the requirements.txt file located in the specified project directory. + + Args: + params: An object containing the project_name attribute, which specifies the directory of the project. 
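# Sketch of the dataset pull described for pull_dataset_repo, using
# huggingface_hub.snapshot_download with the attributes named in the docstring
# (data_path, project_name, token); the repository id is hypothetical.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="username/autotrain-my-project",  # hypothetical dataset repo
    local_dir="my-project",
    token=None,               # or a real HF token for private repos
    repo_type="dataset",
)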
+ + Behavior: + - Checks if a requirements.txt file exists in the project directory. + - Reads the requirements.txt file and filters out lines starting with a hyphen. + - Rewrites the filtered requirements back to the requirements.txt file. + - Uses subprocess to run the pip install command on the requirements.txt file. + - Logs the installation status. + + Returns: + None + """ # check if params.project_name has a requirements.txt if os.path.exists(f"{params.project_name}/requirements.txt"): # install the requirements using subprocess, wait for it to finish @@ -96,6 +150,20 @@ def install_requirements(params): def run_command(params): + """ + Executes a Python script with optional arguments in a specified project directory. + + Args: + params (object): An object containing the following attributes: + - project_name (str): The name of the project directory where the script is located. + - args (dict): A dictionary of arguments to pass to the script. Keys are argument names, and values are argument values. + + Raises: + ValueError: If the script.py file is not found in the specified project directory. + + Returns: + None + """ if os.path.exists(f"{params.project_name}/script.py"): cmd = ["python", "script.py"] if params.args: @@ -111,6 +179,19 @@ def run_command(params): def pause_endpoint(params): + """ + Pauses a specific endpoint using the Hugging Face API. + + This function retrieves the endpoint ID from the environment variables, + extracts the username and project name from the endpoint ID, constructs + the API URL, and sends a POST request to pause the endpoint. + + Args: + params (object): An object containing the token attribute for authorization. + + Returns: + dict: The JSON response from the API call. + """ endpoint_id = os.environ["ENDPOINT_ID"] username = endpoint_id.split("/")[0] project_name = endpoint_id.split("/")[1] diff --git a/src/autotrain/trainers/image_classification/dataset.py b/src/autotrain/trainers/image_classification/dataset.py index 74f9e37f1d..10d7e6c1af 100644 --- a/src/autotrain/trainers/image_classification/dataset.py +++ b/src/autotrain/trainers/image_classification/dataset.py @@ -3,6 +3,28 @@ class ImageClassificationDataset: + """ + A custom dataset class for image classification tasks. + + Args: + data (list): A list of data samples, where each sample is a dictionary containing image and target information. + transforms (callable): A function/transform that takes in an image and returns a transformed version. + config (object): A configuration object containing the column names for images and targets. + + Attributes: + data (list): The dataset containing image and target information. + transforms (callable): The transformation function to be applied to the images. + config (object): The configuration object with image and target column names. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Retrieves the image and target at the specified index, applies transformations, and returns them as tensors. 
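# Standalone sketch of run_command's described behaviour: build
# "python script.py --key value ..." and execute it for the project, failing when
# script.py is absent. Running via cwd is an assumption of this sketch.
import os
import subprocess

def run_command_sketch(project_name, args=None):
    script = os.path.join(project_name, "script.py")
    if not os.path.exists(script):
        raise ValueError("script.py not found in the project directory")
    cmd = ["python", "script.py"]
    for key, value in (args or {}).items():
        cmd.extend([f"--{key}", str(value)])
    subprocess.run(cmd, cwd=project_name, check=True)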
+ + Example: + dataset = ImageClassificationDataset(data, transforms, config) + image, target = dataset[0] + """ + def __init__(self, data, transforms, config): self.data = data self.transforms = transforms diff --git a/src/autotrain/trainers/image_classification/params.py b/src/autotrain/trainers/image_classification/params.py index 05c3c2a0c0..c213972066 100644 --- a/src/autotrain/trainers/image_classification/params.py +++ b/src/autotrain/trainers/image_classification/params.py @@ -6,31 +6,65 @@ class ImageClassificationParams(AutoTrainParams): - data_path: str = Field(None, title="Data path") - model: str = Field("google/vit-base-patch16-224", title="Model name") - username: Optional[str] = Field(None, title="Hugging Face Username") - lr: float = Field(5e-5, title="Learning rate") - epochs: int = Field(3, title="Number of training epochs") - batch_size: int = Field(8, title="Training batch size") - warmup_ratio: float = Field(0.1, title="Warmup proportion") - gradient_accumulation: int = Field(1, title="Gradient accumulation steps") - optimizer: str = Field("adamw_torch", title="Optimizer") - scheduler: str = Field("linear", title="Scheduler") - weight_decay: float = Field(0.0, title="Weight decay") - max_grad_norm: float = Field(1.0, title="Max gradient norm") - seed: int = Field(42, title="Seed") - train_split: str = Field("train", title="Train split") - valid_split: Optional[str] = Field(None, title="Validation split") - logging_steps: int = Field(-1, title="Logging steps") - project_name: str = Field("project-name", title="Output directory") - auto_find_batch_size: bool = Field(False, title="Auto find batch size") - mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") - save_total_limit: int = Field(1, title="Save total limit") - token: Optional[str] = Field(None, title="Hub Token") - push_to_hub: bool = Field(False, title="Push to hub") - eval_strategy: str = Field("epoch", title="Evaluation strategy") - image_column: str = Field("image", title="Image column") - target_column: str = Field("target", title="Target column") - log: str = Field("none", title="Logging using experiment tracking") - early_stopping_patience: int = Field(5, title="Early stopping patience") - early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") + """ + ImageClassificationParams is a configuration class for image classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Pre-trained model name or path. Default is "google/vit-base-patch16-224". + username (Optional[str]): Hugging Face account username. + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of epochs for training. Default is 3. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Warmup ratio for learning rate scheduler. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer type. Default is "adamw_torch". + scheduler (str): Learning rate scheduler type. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + logging_steps (int): Number of steps between logging. Default is -1. 
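An illustrative stand-in for the image classification dataset documented above. The albumentations-style transform interface (a callable returning a dict with an `"image"` key) and the column names are assumptions; only the `__len__`/`__getitem__` shape mirrors the docstring.

```python
import numpy as np
import torch
from PIL import Image

def dummy_transform(image):
    # Stand-in for an albumentations-style pipeline (assumed dict-returning interface)
    return {"image": image.astype(np.float32) / 255.0}

class SimpleImageClassificationDataset:
    """Illustrative sketch, not the AutoTrain implementation."""

    def __init__(self, data, transforms, image_column="image", target_column="target"):
        self.data = data
        self.transforms = transforms
        self.image_column = image_column
        self.target_column = target_column

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        image = self.data[item][self.image_column].convert("RGB")
        target = int(self.data[item][self.target_column])
        image = self.transforms(np.array(image))["image"]
        image = np.transpose(image, (2, 0, 1))  # HWC -> CHW for the vision model
        return {
            "pixel_values": torch.tensor(image, dtype=torch.float),
            "labels": torch.tensor(target, dtype=torch.long),
        }

data = [{"image": Image.new("RGB", (224, 224)), "target": 1}]
sample = SimpleImageClassificationDataset(data, dummy_transform)[0]
print(sample["pixel_values"].shape, sample["labels"])
```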
+ project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Automatically find optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). + save_total_limit (int): Maximum number of checkpoints to keep. Default is 1. + token (Optional[str]): Hugging Face Hub token for authentication. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy during training. Default is "epoch". + image_column (str): Column name for images in the dataset. Default is "image". + target_column (str): Column name for target labels in the dataset. Default is "target". + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping. Default is 0.01. + """ + + data_path: str = Field(None, title="Path to the dataset") + model: str = Field("google/vit-base-patch16-224", title="Pre-trained model name or path") + username: Optional[str] = Field(None, title="Hugging Face account username") + lr: float = Field(5e-5, title="Learning rate for the optimizer") + epochs: int = Field(3, title="Number of epochs for training") + batch_size: int = Field(8, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Warmup ratio for learning rate scheduler") + gradient_accumulation: int = Field(1, title="Number of gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer type") + scheduler: str = Field("linear", title="Learning rate scheduler type") + weight_decay: float = Field(0.0, title="Weight decay for the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum gradient norm for clipping") + seed: int = Field(42, title="Random seed for reproducibility") + train_split: str = Field("train", title="Name of the training data split") + valid_split: Optional[str] = Field(None, title="Name of the validation data split") + logging_steps: int = Field(-1, title="Number of steps between logging") + project_name: str = Field("project-name", title="Name of the project for output directory") + auto_find_batch_size: bool = Field(False, title="Automatically find optimal batch size") + mixed_precision: Optional[str] = Field(None, title="Mixed precision training mode (fp16, bf16, or None)") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to keep") + token: Optional[str] = Field(None, title="Hugging Face Hub token for authentication") + push_to_hub: bool = Field(False, title="Whether to push the model to Hugging Face Hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy during training") + image_column: str = Field("image", title="Column name for images in the dataset") + target_column: str = Field("target", title="Column name for target labels in the dataset") + log: str = Field("none", title="Logging method for experiment tracking") + early_stopping_patience: int = Field(5, title="Number of epochs with no improvement for early stopping") + early_stopping_threshold: float = Field(0.01, title="Threshold for early stopping") diff --git a/src/autotrain/trainers/image_classification/utils.py b/src/autotrain/trainers/image_classification/utils.py index ce64d73fcd..74b0a1cc3f 100644 --- a/src/autotrain/trainers/image_classification/utils.py +++ 
b/src/autotrain/trainers/image_classification/utils.py @@ -54,6 +54,22 @@ def _binary_classification_metrics(pred): + """ + Computes various binary classification metrics given the predictions and labels. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + raw_predictions (numpy.ndarray): The raw prediction scores from the model. + labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - f1 (float): The F1 score. + - precision (float): The precision score. + - recall (float): The recall score. + - auc (float): The Area Under the ROC Curve (AUC) score. + - accuracy (float): The accuracy score. + """ raw_predictions, labels = pred predictions = np.argmax(raw_predictions, axis=1) result = { @@ -67,6 +83,27 @@ def _binary_classification_metrics(pred): def _multi_class_classification_metrics(pred): + """ + Compute various classification metrics for multi-class classification. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores for each class. + - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1_macro": F1 score with macro averaging. + - "f1_micro": F1 score with micro averaging. + - "f1_weighted": F1 score with weighted averaging. + - "precision_macro": Precision score with macro averaging. + - "precision_micro": Precision score with micro averaging. + - "precision_weighted": Precision score with weighted averaging. + - "recall_macro": Recall score with macro averaging. + - "recall_micro": Recall score with micro averaging. + - "recall_weighted": Recall score with weighted averaging. + - "accuracy": Accuracy score. + """ raw_predictions, labels = pred predictions = np.argmax(raw_predictions, axis=1) results = { @@ -85,6 +122,18 @@ def _multi_class_classification_metrics(pred): def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data for image classification. + + Args: + train_data (Dataset): The training dataset. + valid_data (Dataset or None): The validation dataset. Can be None if no validation data is provided. + image_processor (ImageProcessor): An object containing image processing parameters such as size, mean, and std. + config (dict): Configuration dictionary containing additional parameters for dataset processing. + + Returns: + tuple: A tuple containing the processed training dataset and the processed validation dataset (or None if no validation data is provided). + """ if "shortest_edge" in image_processor.size: size = image_processor.size["shortest_edge"] else: @@ -119,6 +168,26 @@ def process_data(train_data, valid_data, image_processor, config): def create_model_card(config, trainer, num_classes): + """ + Generates a model card for the given configuration and trainer. + + Args: + config (object): Configuration object containing various settings. + trainer (object): Trainer object used for model training and evaluation. + num_classes (int): Number of classes in the classification task. + + Returns: + str: A formatted string representing the model card. + + The function evaluates the model if a validation split is provided in the config. + It then formats the evaluation scores based on whether the task is binary or multi-class classification. + If no validation split is provided, it notes that no validation metrics are available. 
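The binary classification metrics listed above map directly onto scikit-learn; a small sketch on dummy logits and labels (the dummy values are placeholders):

```python
import numpy as np
from sklearn import metrics

# Dummy (raw_predictions, labels) in the shape the metric function expects
raw_predictions = np.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]])
labels = np.array([1, 0, 0])

predictions = np.argmax(raw_predictions, axis=1)
result = {
    "f1": metrics.f1_score(labels, predictions),
    "precision": metrics.precision_score(labels, predictions),
    "recall": metrics.recall_score(labels, predictions),
    "auc": metrics.roc_auc_score(labels, raw_predictions[:, 1]),
    "accuracy": metrics.accuracy_score(labels, predictions),
}
print(result)
```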
+ + The function also checks the data path and model path in the config to determine if they are directories. + Based on these checks, it formats the dataset tag and base model information accordingly. + + Finally, it uses the formatted information to create and return the model card string. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() valid_metrics = ( diff --git a/src/autotrain/trainers/image_regression/dataset.py b/src/autotrain/trainers/image_regression/dataset.py index 1581f58776..43c01a13e1 100644 --- a/src/autotrain/trainers/image_regression/dataset.py +++ b/src/autotrain/trainers/image_regression/dataset.py @@ -3,6 +3,24 @@ class ImageRegressionDataset: + """ + A dataset class for image regression tasks. + + Args: + data (list): A list of data points where each data point is a dictionary containing image and target information. + transforms (callable): A function/transform that takes in an image and returns a transformed version. + config (object): A configuration object that contains the column names for images and targets. + + Attributes: + data (list): The input data. + transforms (callable): The transformation function. + config (object): The configuration object. + + Methods: + __len__(): Returns the number of data points in the dataset. + __getitem__(item): Returns a dictionary containing the transformed image and the target value for the given index. + """ + def __init__(self, data, transforms, config): self.data = data self.transforms = transforms diff --git a/src/autotrain/trainers/image_regression/params.py b/src/autotrain/trainers/image_regression/params.py index 7bdb3fad73..c47b1eb7e8 100644 --- a/src/autotrain/trainers/image_regression/params.py +++ b/src/autotrain/trainers/image_regression/params.py @@ -6,6 +6,40 @@ class ImageRegressionParams(AutoTrainParams): + """ + ImageRegressionParams is a configuration class for image regression training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "google/vit-base-patch16-224". + username (Optional[str]): Hugging Face Username. + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Max gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Train split name. Default is "train". + valid_split (Optional[str]): Validation split name. + logging_steps (int): Logging steps. Default is -1. + project_name (str): Output directory name. Default is "project-name". + auto_find_batch_size (bool): Whether to auto find batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision type (fp16, bf16, or None). + save_total_limit (int): Save total limit. Default is 1. + token (Optional[str]): Hub Token. + push_to_hub (bool): Whether to push to hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + image_column (str): Image column name. Default is "image". + target_column (str): Target column name. Default is "target". + log (str): Logging using experiment tracking. Default is "none". 
+ early_stopping_patience (int): Early stopping patience. Default is 5. + early_stopping_threshold (float): Early stopping threshold. Default is 0.01. + """ + data_path: str = Field(None, title="Data path") model: str = Field("google/vit-base-patch16-224", title="Model name") username: Optional[str] = Field(None, title="Hugging Face Username") diff --git a/src/autotrain/trainers/image_regression/utils.py b/src/autotrain/trainers/image_regression/utils.py index 423f69fe41..881d2eede5 100644 --- a/src/autotrain/trainers/image_regression/utils.py +++ b/src/autotrain/trainers/image_regression/utils.py @@ -43,6 +43,24 @@ def image_regression_metrics(pred): + """ + Calculate various regression metrics for image regression tasks. + + Args: + pred (tuple): A tuple containing raw predictions and labels. + raw_predictions should be a list of lists or a list of numpy.float32 values. + labels should be a list of true values. + + Returns: + dict: A dictionary containing the calculated metrics: + - 'mse': Mean Squared Error + - 'mae': Mean Absolute Error + - 'r2': R^2 Score + - 'rmse': Root Mean Squared Error + - 'explained_variance': Explained Variance Score + + If an error occurs during the calculation of a metric, the value for that metric will be -999. + """ raw_predictions, labels = pred try: @@ -70,6 +88,18 @@ def image_regression_metrics(pred): def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data by applying image transformations. + + Args: + train_data (Dataset): The training dataset. + valid_data (Dataset or None): The validation dataset. If None, only training data is processed. + image_processor (ImageProcessor): An object containing image processing parameters such as size, mean, and std. + config (dict): Configuration dictionary containing additional parameters for the dataset. + + Returns: + tuple: A tuple containing the processed training dataset and the processed validation dataset (or None if valid_data is None). + """ if "shortest_edge" in image_processor.size: size = image_processor.size["shortest_edge"] else: @@ -104,6 +134,19 @@ def process_data(train_data, valid_data, image_processor, config): def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing various settings such as + valid_split, data_path, project_name, and model. + trainer (object): Trainer object used to evaluate the model if validation + split is provided. + + Returns: + str: A formatted model card string containing dataset information, + validation metrics, and base model details. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in VALID_METRICS] diff --git a/src/autotrain/trainers/object_detection/dataset.py b/src/autotrain/trainers/object_detection/dataset.py index 8e6085392a..6d9315f60a 100644 --- a/src/autotrain/trainers/object_detection/dataset.py +++ b/src/autotrain/trainers/object_detection/dataset.py @@ -2,6 +2,30 @@ class ObjectDetectionDataset: + """ + A dataset class for object detection tasks. + + Args: + data (list): A list of data entries where each entry is a dictionary containing image and object information. + transforms (callable): A function or transform to apply to the images and bounding boxes. + image_processor (callable): A function or processor to convert images and annotations into the desired format. 
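The regression metrics and the `-999` fallback described above can be reproduced with scikit-learn; a sketch on dummy predictions and targets:

```python
import numpy as np
from sklearn import metrics

raw_predictions = np.array([3.1, 2.9, 4.2])  # dummy model outputs
labels = np.array([3.0, 3.0, 4.0])           # dummy targets

def safe_compute(metric_func, default=-999):
    # Mirror the documented behaviour: fall back to -999 if a metric fails
    try:
        return metric_func(labels, raw_predictions)
    except Exception:
        return default

results = {
    "mse": safe_compute(metrics.mean_squared_error),
    "mae": safe_compute(metrics.mean_absolute_error),
    "r2": safe_compute(metrics.r2_score),
    "rmse": safe_compute(lambda y, p: float(np.sqrt(metrics.mean_squared_error(y, p)))),
    "explained_variance": safe_compute(metrics.explained_variance_score),
}
print(results)
```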
+ config (object): A configuration object containing column names for images and objects. + + Attributes: + data (list): The dataset containing image and object information. + transforms (callable): The transform function to apply to the images and bounding boxes. + image_processor (callable): The processor to convert images and annotations into the desired format. + config (object): The configuration object with column names for images and objects. + + Methods: + __len__(): Returns the number of items in the dataset. + __getitem__(item): Retrieves and processes the image and annotations for the given index. + + Example: + dataset = ObjectDetectionDataset(data, transforms, image_processor, config) + image_data = dataset[0] + """ + def __init__(self, data, transforms, image_processor, config): self.data = data self.transforms = transforms diff --git a/src/autotrain/trainers/object_detection/params.py b/src/autotrain/trainers/object_detection/params.py index cf1e00d74e..19cca75601 100644 --- a/src/autotrain/trainers/object_detection/params.py +++ b/src/autotrain/trainers/object_detection/params.py @@ -6,6 +6,41 @@ class ObjectDetectionParams(AutoTrainParams): + """ + ObjectDetectionParams is a configuration class for object detection training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to be used. Default is "google/vit-base-patch16-224". + username (Optional[str]): Hugging Face Username. + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to be used. Default is "adamw_torch". + scheduler (str): Scheduler to be used. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Max gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision type (fp16, bf16, or None). + save_total_limit (int): Total number of checkpoints to save. Default is 1. + token (Optional[str]): Hub Token for authentication. + push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + image_column (str): Name of the image column in the dataset. Default is "image". + objects_column (str): Name of the target column in the dataset. Default is "objects". + log (str): Logging method for experiment tracking. Default is "none". + image_square_size (Optional[int]): Longest size to which the image will be resized, then padded to square. Default is 600. + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Minimum change to qualify as an improvement. Default is 0.01. 
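A hedged usage sketch for the parameter class documented above. The import path follows the file touched in this hunk; the dataset repo and the DETR checkpoint are illustrative choices, and unset fields keep the defaults listed in the docstring.

```python
from autotrain.trainers.object_detection.params import ObjectDetectionParams

params = ObjectDetectionParams(
    data_path="username/my-detection-dataset",  # placeholder dataset repo
    model="facebook/detr-resnet-50",            # illustrative checkpoint
    image_column="image",
    objects_column="objects",
    epochs=3,
    batch_size=8,
    project_name="detr-finetune",
    push_to_hub=False,
)
print(params.model, params.image_square_size)  # image_square_size defaults to 600
```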
+ """ + data_path: str = Field(None, title="Data path") model: str = Field("google/vit-base-patch16-224", title="Model name") username: Optional[str] = Field(None, title="Hugging Face Username") diff --git a/src/autotrain/trainers/object_detection/utils.py b/src/autotrain/trainers/object_detection/utils.py index 415f2f30ff..a55ea2a218 100644 --- a/src/autotrain/trainers/object_detection/utils.py +++ b/src/autotrain/trainers/object_detection/utils.py @@ -50,6 +50,20 @@ def collate_fn(batch): + """ + Collates a batch of data for object detection training. + + Args: + batch (list): A list of dictionaries, where each dictionary contains + 'pixel_values', 'labels', and optionally 'pixel_mask'. + + Returns: + dict: A dictionary with the following keys: + - 'pixel_values' (torch.Tensor): A tensor containing stacked pixel values from the batch. + - 'labels' (list): A list of labels from the batch. + - 'pixel_mask' (torch.Tensor, optional): A tensor containing stacked pixel masks from the batch, + if 'pixel_mask' is present in the input batch. + """ data = {} data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) data["labels"] = [x["labels"] for x in batch] @@ -59,6 +73,18 @@ def collate_fn(batch): def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data for object detection. + + Args: + train_data (list): List of training data samples. + valid_data (list or None): List of validation data samples. If None, only training data is processed. + image_processor (object): An image processor object that contains image processing configurations. + config (dict): Configuration dictionary containing various settings for data processing. + + Returns: + tuple: A tuple containing processed training data and validation data (if provided). If validation data is not provided, the second element of the tuple is None. + """ max_size = image_processor.size["longest_edge"] basic_transforms = [ A.LongestMaxSize(max_size=max_size), @@ -203,6 +229,20 @@ class ModelOutput: def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing the following attributes: + - valid_split (optional): Validation split information. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + trainer (object): Trainer object with an `evaluate` method that returns evaluation metrics. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in VALID_METRICS] diff --git a/src/autotrain/trainers/sent_transformers/params.py b/src/autotrain/trainers/sent_transformers/params.py index dcf06c0293..10d8c5f378 100644 --- a/src/autotrain/trainers/sent_transformers/params.py +++ b/src/autotrain/trainers/sent_transformers/params.py @@ -6,6 +6,44 @@ class SentenceTransformersParams(AutoTrainParams): + """ + SentenceTransformersParams is a configuration class for setting up parameters for training sentence transformers. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the pre-trained model to use. Default is "microsoft/mpnet-base". + lr (float): Learning rate for training. Default is 3e-5. + epochs (int): Number of training epochs. 
Default is 3. + max_seq_length (int): Maximum sequence length for the input. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. Default is None. + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Token for accessing Hugging Face Hub. Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy to use. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum, to qualify as an improvement. Default is 0.01. + trainer (str): Name of the trainer to use. Default is "pair_score". + sentence1_column (str): Name of the column containing the first sentence. Default is "sentence1". + sentence2_column (str): Name of the column containing the second sentence. Default is "sentence2". + sentence3_column (Optional[str]): Name of the column containing the third sentence (if applicable). Default is None. + target_column (Optional[str]): Name of the column containing the target variable. Default is None. + """ + data_path: str = Field(None, title="Data path") model: str = Field("microsoft/mpnet-base", title="Model name") lr: float = Field(3e-5, title="Learning rate") diff --git a/src/autotrain/trainers/sent_transformers/utils.py b/src/autotrain/trainers/sent_transformers/utils.py index 5940b32eda..57cc626d44 100644 --- a/src/autotrain/trainers/sent_transformers/utils.py +++ b/src/autotrain/trainers/sent_transformers/utils.py @@ -60,6 +60,26 @@ def process_columns(data, config): + """ + Processes and renames columns in the dataset based on the trainer type specified in the configuration. + + Args: + data (Dataset): The dataset containing the columns to be processed. + config (Config): Configuration object containing the trainer type and column names. + + Returns: + Dataset: The dataset with renamed columns as per the trainer type. + + Raises: + ValueError: If the trainer type specified in the configuration is invalid. + + Trainer Types and Corresponding Columns: + - "pair": Renames columns to "anchor" and "positive". + - "pair_class": Renames columns to "premise", "hypothesis", and "label". 
+ - "pair_score": Renames columns to "sentence1", "sentence2", and "score". + - "triplet": Renames columns to "anchor", "positive", and "negative". + - "qa": Renames columns to "query" and "answer". + """ # trainers: pair, pair_class, pair_score, triplet, qa # pair: anchor, positive # pair_class: premise, hypothesis, label @@ -103,6 +123,16 @@ def process_columns(data, config): def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing model and dataset details. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() logger.info(eval_scores) diff --git a/src/autotrain/trainers/seq2seq/dataset.py b/src/autotrain/trainers/seq2seq/dataset.py index f0ca839afe..15186873b0 100644 --- a/src/autotrain/trainers/seq2seq/dataset.py +++ b/src/autotrain/trainers/seq2seq/dataset.py @@ -1,4 +1,24 @@ class Seq2SeqDataset: + """ + A dataset class for sequence-to-sequence tasks. + + Args: + data (list): The dataset containing input and target sequences. + tokenizer (PreTrainedTokenizer): The tokenizer to process the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list): The dataset containing input and target sequences. + tokenizer (PreTrainedTokenizer): The tokenizer to process the text data. + config (object): Configuration object containing dataset parameters. + max_len_input (int): Maximum length for input sequences. + max_len_target (int): Maximum length for target sequences. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns the tokenized input and target sequences for a given index. + """ + def __init__(self, data, tokenizer, config): self.data = data self.tokenizer = tokenizer diff --git a/src/autotrain/trainers/seq2seq/params.py b/src/autotrain/trainers/seq2seq/params.py index c11dfeb29c..0683230946 100644 --- a/src/autotrain/trainers/seq2seq/params.py +++ b/src/autotrain/trainers/seq2seq/params.py @@ -6,6 +6,48 @@ class Seq2SeqParams(AutoTrainParams): + """ + Seq2SeqParams is a configuration class for sequence-to-sequence training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to be used. Default is "google/flan-t5-base". + username (Optional[str]): Hugging Face Username. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + project_name (str): Name of the project or output directory. Default is "project-name". + token (Optional[str]): Hub Token for authentication. + push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + text_column (str): Name of the text column in the dataset. Default is "text". + target_column (str): Name of the target text column in the dataset. Default is "target". + lr (float): Learning rate for training. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for input text. Default is 128. + max_target_length (int): Maximum sequence length for target text. Default is 128. + batch_size (int): Training batch size. Default is 2. 
+ warmup_ratio (float): Proportion of warmup steps. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to be used. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to be used. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + logging_steps (int): Number of steps between logging. Default is -1 (disabled). + eval_strategy (str): Evaluation strategy. Default is "epoch". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). Default is False. + quantization (Optional[str]): Quantization mode (int4, int8, or None). Default is "int8". + lora_r (int): LoRA-R parameter for PEFT. Default is 16. + lora_alpha (int): LoRA-Alpha parameter for PEFT. Default is 32. + lora_dropout (float): LoRA-Dropout parameter for PEFT. Default is 0.05. + target_modules (str): Target modules for PEFT. Default is "all-linear". + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Patience for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping. Default is 0.01. + """ + data_path: str = Field(None, title="Data path") model: str = Field("google/flan-t5-base", title="Model name") username: Optional[str] = Field(None, title="Hugging Face Username") diff --git a/src/autotrain/trainers/seq2seq/utils.py b/src/autotrain/trainers/seq2seq/utils.py index cc43be2c29..2a4471cf3a 100644 --- a/src/autotrain/trainers/seq2seq/utils.py +++ b/src/autotrain/trainers/seq2seq/utils.py @@ -26,6 +26,18 @@ def _seq2seq_metrics(pred, tokenizer): + """ + Compute sequence-to-sequence metrics for predictions and labels. + + Args: + pred (tuple): A tuple containing predictions and labels. + Predictions and labels are expected to be token IDs. + tokenizer (PreTrainedTokenizer): The tokenizer used for decoding the predictions and labels. + + Returns: + dict: A dictionary containing the computed ROUGE metrics and the average length of the generated sequences. + The keys are the metric names and the values are the corresponding scores rounded to four decimal places. + """ predictions, labels = pred decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) @@ -45,6 +57,20 @@ def _seq2seq_metrics(pred, tokenizer): def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing the following attributes: + - valid_split (optional): If not None, the function will include evaluation scores. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + trainer (object): Trainer object with an `evaluate` method that returns evaluation metrics. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. 
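A sketch of the ROUGE computation behind `_seq2seq_metrics`, using the `evaluate` library on already-decoded strings; in the trainer the inputs are token IDs that are first decoded with the tokenizer, and the generated-length statistic is computed from token counts rather than whitespace splits.

```python
import numpy as np
import evaluate

rouge = evaluate.load("rouge")

decoded_preds = ["the cat sat on the mat", "a quick brown fox"]
decoded_labels = ["the cat is on the mat", "a fast brown fox"]

result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
result = {k: round(v * 100, 4) for k, v in result.items()}

# Average generated length (here approximated with whitespace tokens)
result["gen_len"] = float(np.mean([len(p.split()) for p in decoded_preds]))
print(result)
```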
+ """ if config.valid_split is not None: eval_scores = trainer.evaluate() eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items()] diff --git a/src/autotrain/trainers/tabular/__main__.py b/src/autotrain/trainers/tabular/__main__.py index 15bd78d264..862eb222dd 100644 --- a/src/autotrain/trainers/tabular/__main__.py +++ b/src/autotrain/trainers/tabular/__main__.py @@ -26,6 +26,24 @@ def parse_args(): def optimize(trial, model_name, xtrain, xvalid, ytrain, yvalid, eval_metric, task, preprocessor): + """ + Optimize the model based on the given trial and parameters. + + Parameters: + trial (dict or optuna.trial.Trial): The trial object or dictionary containing hyperparameters. + model_name (str): The name of the model to be used (e.g., "xgboost"). + xtrain (pd.DataFrame or np.ndarray): Training features. + xvalid (pd.DataFrame or np.ndarray): Validation features. + ytrain (pd.Series or np.ndarray): Training labels. + yvalid (pd.Series or np.ndarray): Validation labels. + eval_metric (str): The evaluation metric to be used for optimization. + task (str): The type of task (e.g., "binary_classification", "multi_class_classification", "single_column_regression"). + preprocessor (object): The preprocessor object to be applied to the data. + + Returns: + float or tuple: If trial is a dictionary, returns a tuple containing the models, preprocessor, and metric dictionary. + Otherwise, returns the loss value based on the evaluation metric. + """ if isinstance(trial, dict): params = trial else: @@ -127,6 +145,29 @@ def optimize(trial, model_name, xtrain, xvalid, ytrain, yvalid, eval_metric, tas @monitor def train(config): + """ + Train a tabular model based on the provided configuration. + + Args: + config (dict or TabularParams): Configuration parameters for training. If a dictionary is provided, it will be converted to a TabularParams object. + + Raises: + Exception: If `valid_data` is None, indicating that a valid split for tabular training was not provided. + + The function performs the following steps: + 1. Loads the training and validation datasets from disk or a specified data path. + 2. Identifies and processes categorical and numerical columns. + 3. Encodes target columns for classification tasks. + 4. Constructs preprocessing pipelines for numerical and categorical data. + 5. Determines the sub-task (e.g., binary classification, multi-class classification, regression). + 6. Optimizes the model using Optuna for hyperparameter tuning. + 7. Saves the best model and target encoders to disk. + 8. Creates and saves a model card. + 9. Optionally pushes the model to the Hugging Face Hub. + + Note: + The function expects the configuration to contain various parameters such as `data_path`, `train_split`, `valid_split`, `categorical_columns`, `numerical_columns`, `model`, `task`, `num_trials`, `time_limit`, `project_name`, `token`, `username`, and `push_to_hub`. + """ if isinstance(config, dict): config = TabularParams(**config) diff --git a/src/autotrain/trainers/tabular/params.py b/src/autotrain/trainers/tabular/params.py index be124ce55b..ed553cba31 100644 --- a/src/autotrain/trainers/tabular/params.py +++ b/src/autotrain/trainers/tabular/params.py @@ -6,6 +6,31 @@ class TabularParams(AutoTrainParams): + """ + TabularParams is a configuration class for tabular data training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "xgboost". + username (Optional[str]): Hugging Face Username. 
+ seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + project_name (str): Name of the output directory. Default is "project-name". + token (Optional[str]): Hub Token for authentication. + push_to_hub (bool): Whether to push the model to the hub. Default is False. + id_column (str): Name of the ID column. Default is "id". + target_columns (Union[List[str], str]): Target column(s) in the dataset. Default is ["target"]. + categorical_columns (Optional[List[str]]): List of categorical columns. + numerical_columns (Optional[List[str]]): List of numerical columns. + task (str): Type of task (e.g., "classification"). Default is "classification". + num_trials (int): Number of trials for hyperparameter optimization. Default is 10. + time_limit (int): Time limit for training in seconds. Default is 600. + categorical_imputer (Optional[str]): Imputer strategy for categorical columns. + numerical_imputer (Optional[str]): Imputer strategy for numerical columns. + numeric_scaler (Optional[str]): Scaler strategy for numerical columns. + """ + data_path: str = Field(None, title="Data path") model: str = Field("xgboost", title="Model name") username: Optional[str] = Field(None, title="Hugging Face Username") diff --git a/src/autotrain/trainers/tabular/utils.py b/src/autotrain/trainers/tabular/utils.py index 788a268535..11e7d87c13 100644 --- a/src/autotrain/trainers/tabular/utils.py +++ b/src/autotrain/trainers/tabular/utils.py @@ -87,6 +87,42 @@ @dataclass class TabularMetrics: + """ + A class to calculate various metrics for different types of tabular tasks. + + Attributes: + ----------- + sub_task : str + The type of sub-task. It can be one of the following: + - "binary_classification" + - "multi_class_classification" + - "single_column_regression" + - "multi_column_regression" + - "multi_label_classification" + labels : Optional[List], optional + The list of labels for multi-class classification tasks (default is None). + + Methods: + -------- + __post_init__(): + Initializes the valid metrics based on the sub-task type. + + calculate(y_true, y_pred): + Calculates the metrics based on the true and predicted values. + + Parameters: + ----------- + y_true : array-like + True labels or values. + y_pred : array-like + Predicted labels or values. + + Returns: + -------- + dict + A dictionary with metric names as keys and their calculated values as values. + """ + sub_task: str labels: Optional[List] = None @@ -167,6 +203,28 @@ def calculate(self, y_true, y_pred): class TabularModel: + """ + A class used to represent a Tabular Model for AutoTrain training. + + Attributes + ---------- + model : str + The name of the model to be used. + preprocessor : object + The preprocessor to be applied to the data. + sub_task : str + The sub-task type, either classification or regression. + params : dict + The parameters to be passed to the model. + use_predict_proba : bool + A flag indicating whether to use the predict_proba method. + + Methods + ------- + _get_model(): + Retrieves the appropriate model based on the sub-task and model name. + """ + def __init__(self, model, preprocessor, sub_task, params): self.model = model self.preprocessor = preprocessor @@ -349,6 +407,23 @@ def get_params(trial, model, task): def get_imputer(imputer_name): + """ + Returns an imputer object based on the specified imputer name. 
+ + Parameters: + imputer_name (str): The name of the imputer to use. Can be one of the following: + - "median": Uses the median value for imputation. + - "mean": Uses the mean value for imputation. + - "most_frequent": Uses the most frequent value for imputation. + If None, returns None. + + Returns: + impute.SimpleImputer or None: An instance of SimpleImputer with the specified strategy, + or None if imputer_name is None. + + Raises: + ValueError: If an invalid imputer_name is provided. + """ if imputer_name is None: return None if imputer_name == "median": @@ -361,6 +436,21 @@ def get_imputer(imputer_name): def get_scaler(scaler_name): + """ + Returns a scaler object based on the provided scaler name. + + Parameters: + scaler_name (str): The name of the scaler to be returned. + Possible values are "standard", "minmax", "robust", and "normal". + If None, returns None. + + Returns: + scaler: An instance of the corresponding scaler from sklearn.preprocessing. + If the scaler_name is None, returns None. + + Raises: + ValueError: If the scaler_name is not one of the expected values. + """ if scaler_name is None: return None if scaler_name == "standard": @@ -375,6 +465,25 @@ def get_scaler(scaler_name): def get_metric_direction(sub_task): + """ + Determines the appropriate metric and its optimization direction based on the given sub-task. + + Parameters: + sub_task (str): The type of sub-task. Must be one of the following: + - "binary_classification" + - "multi_class_classification" + - "single_column_regression" + - "multi_label_classification" + - "multi_column_regression" + + Returns: + tuple: A tuple containing: + - str: The metric to be used (e.g., "logloss", "mlogloss", "rmse"). + - str: The direction of optimization ("minimize"). + + Raises: + ValueError: If the provided sub_task is not one of the recognized types. + """ if sub_task == "binary_classification": return "logloss", "minimize" if sub_task == "multi_class_classification": @@ -389,14 +498,44 @@ def get_metric_direction(sub_task): def get_categorical_columns(df): + """ + Extracts the names of categorical columns from a DataFrame. + + Parameters: + df (pandas.DataFrame): The DataFrame from which to extract categorical columns. + + Returns: + list: A list of column names that are of categorical data type (either 'category' or 'object'). + """ return list(df.select_dtypes(include=["category", "object"]).columns) def get_numerical_columns(df): + """ + Extracts and returns a list of numerical column names from a given DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame from which to extract numerical columns. + + Returns: + list: A list of column names that have numerical data types. + """ return list(df.select_dtypes(include=["number"]).columns) def create_model_card(config, sub_task, best_params, best_metrics): + """ + Generates a markdown formatted model card with the given configuration, sub-task, best parameters, and best metrics. + + Args: + config (object): Configuration object containing task and data path information. + sub_task (str): The specific sub-task for which the model card is being created. + best_params (dict): Dictionary containing the best hyperparameters for the model. + best_metrics (dict): Dictionary containing the best performance metrics for the model. + + Returns: + str: A string containing the formatted model card in markdown. 
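One way the helpers above (column selection, imputers, scalers) can be combined into a preprocessing pipeline; this is an illustrative scikit-learn sketch, not the exact pipeline AutoTrain assembles.

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({"age": [22, None, 41], "city": ["a", "b", None], "target": [0, 1, 0]})

# get_categorical_columns / get_numerical_columns equivalents
categorical_columns = list(df.select_dtypes(include=["category", "object"]).columns)
numerical_columns = [c for c in df.select_dtypes(include=["number"]).columns if c != "target"]

# get_imputer("median") / get_scaler("standard") equivalents, assembled per column type
numeric_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="most_frequent")),
     ("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer(
    [("num", numeric_pipeline, numerical_columns),
     ("cat", categorical_pipeline, categorical_columns)]
)
print(preprocessor.fit_transform(df.drop(columns=["target"])))
```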
+ """ best_metrics = "\n".join([f"- {k}: {v}" for k, v in best_metrics.items()]) best_params = "\n".join([f"- {k}: {v}" for k, v in best_params.items()]) return MARKDOWN.format( diff --git a/src/autotrain/trainers/text_classification/dataset.py b/src/autotrain/trainers/text_classification/dataset.py index 6675d6c994..b14303e741 100644 --- a/src/autotrain/trainers/text_classification/dataset.py +++ b/src/autotrain/trainers/text_classification/dataset.py @@ -2,6 +2,26 @@ class TextClassificationDataset: + """ + A dataset class for text classification tasks. + + Args: + data (list): The dataset containing text and target columns. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list): The dataset containing text and target columns. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + text_column (str): The name of the column containing text data. + target_column (str): The name of the column containing target labels. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns a dictionary containing tokenized input ids, attention mask, token type ids (if available), and target labels for the given item index. + """ + def __init__(self, data, tokenizer, config): self.data = data self.tokenizer = tokenizer diff --git a/src/autotrain/trainers/text_classification/params.py b/src/autotrain/trainers/text_classification/params.py index c38f1d6c72..b03758adad 100644 --- a/src/autotrain/trainers/text_classification/params.py +++ b/src/autotrain/trainers/text_classification/params.py @@ -6,6 +6,41 @@ class TextClassificationParams(AutoTrainParams): + """ + [`TextClassificationParams`] is a configuration class for text classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "bert-base-uncased". + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length. Default is 128. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Maximum gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Name of the training split. Default is "train". + valid_split (Optional[str]): Name of the validation split. Default is None. + text_column (str): Name of the text column in the dataset. Default is "text". + target_column (str): Name of the target column in the dataset. Default is "target". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision setting (fp16, bf16, or None). Default is None. + save_total_limit (int): Total number of checkpoints to save. Default is 1. + token (Optional[str]): Hub token for authentication. Default is None. 
+ push_to_hub (bool): Whether to push the model to the hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum to continue training. Default is 0.01. + """ + data_path: str = Field(None, title="Data path") model: str = Field("bert-base-uncased", title="Model name") lr: float = Field(5e-5, title="Learning rate") diff --git a/src/autotrain/trainers/text_classification/utils.py b/src/autotrain/trainers/text_classification/utils.py index aaa16b341b..eedbc4e3ca 100644 --- a/src/autotrain/trainers/text_classification/utils.py +++ b/src/autotrain/trainers/text_classification/utils.py @@ -47,6 +47,22 @@ def _binary_classification_metrics(pred): + """ + Calculate various binary classification metrics. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores from the model. + - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1" (float): The F1 score. + - "precision" (float): The precision score. + - "recall" (float): The recall score. + - "auc" (float): The Area Under the ROC Curve (AUC) score. + - "accuracy" (float): The accuracy score. + """ raw_predictions, labels = pred predictions = np.argmax(raw_predictions, axis=1) result = { @@ -60,6 +76,27 @@ def _binary_classification_metrics(pred): def _multi_class_classification_metrics(pred): + """ + Compute various classification metrics for multi-class classification. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores for each class. + - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1_macro": F1 score with macro averaging. + - "f1_micro": F1 score with micro averaging. + - "f1_weighted": F1 score with weighted averaging. + - "precision_macro": Precision score with macro averaging. + - "precision_micro": Precision score with micro averaging. + - "precision_weighted": Precision score with weighted averaging. + - "recall_macro": Recall score with macro averaging. + - "recall_micro": Recall score with micro averaging. + - "recall_weighted": Recall score with weighted averaging. + - "accuracy": Accuracy score. + """ raw_predictions, labels = pred predictions = np.argmax(raw_predictions, axis=1) results = { @@ -78,6 +115,17 @@ def _multi_class_classification_metrics(pred): def create_model_card(config, trainer, num_classes): + """ + Generates a model card for a text classification model. + + Args: + config (object): Configuration object containing various settings and paths. + trainer (object): Trainer object used for evaluating the model. + num_classes (int): Number of classes in the classification task. + + Returns: + str: A formatted string representing the model card. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() valid_metrics = ( @@ -108,6 +156,19 @@ def create_model_card(config, trainer, num_classes): def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. 
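A hedged sketch of the pause flow described in the `pause_endpoint` docstring: read `ENDPOINT_ID`, split out the username and project name, and send an authenticated POST. The base URL below is a placeholder standing in for the Inference Endpoints API, and the token is a dummy value.

```python
import os
import requests

# ENDPOINT_ID is expected in the form "<username>/<project_name>"
endpoint_id = os.environ.get("ENDPOINT_ID", "my-user/my-project")
username = endpoint_id.split("/")[0]
project_name = endpoint_id.split("/")[1]

token = "hf_xxx"  # placeholder; supplied via params.token in the trainer

# NOTE: placeholder URL for illustration; consult the Inference Endpoints docs for the real one
api_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{project_name}/pause"
headers = {"Authorization": f"Bearer {token}"}

response = requests.post(api_url, headers=headers, timeout=30)
print(response.json())
```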
+ + This function constructs an API URL using the endpoint ID from the environment + variables, and sends a POST request to pause the specified endpoint. + + Args: + params (object): An object containing the following attribute: + - token (str): The authorization token required to authenticate the API request. + + Returns: + dict: The JSON response from the API call. + """ endpoint_id = os.environ["ENDPOINT_ID"] username = endpoint_id.split("/")[0] project_name = endpoint_id.split("/")[1] diff --git a/src/autotrain/trainers/text_regression/dataset.py b/src/autotrain/trainers/text_regression/dataset.py index 6a682b3a7c..abd54854ed 100644 --- a/src/autotrain/trainers/text_regression/dataset.py +++ b/src/autotrain/trainers/text_regression/dataset.py @@ -2,6 +2,27 @@ class TextRegressionDataset: + """ + A custom dataset class for text regression tasks for AutoTrain. + + Args: + data (list of dict): The dataset containing text and target values. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list of dict): The dataset containing text and target values. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + text_column (str): The column name for text data in the dataset. + target_column (str): The column name for target values in the dataset. + max_len (int): The maximum sequence length for tokenized inputs. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns a dictionary containing tokenized inputs and target value for a given index. + """ + def __init__(self, data, tokenizer, config): self.data = data self.tokenizer = tokenizer diff --git a/src/autotrain/trainers/text_regression/params.py b/src/autotrain/trainers/text_regression/params.py index b469d6bcf2..9c920a1b64 100644 --- a/src/autotrain/trainers/text_regression/params.py +++ b/src/autotrain/trainers/text_regression/params.py @@ -6,6 +6,41 @@ class TextRegressionParams(AutoTrainParams): + """ + TextRegressionParams is a configuration class for setting up text regression training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the pre-trained model to use. Default is "bert-base-uncased". + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for the inputs. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply. Default is 0.0. + max_grad_norm (float): Maximum norm for the gradients. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. Default is None. + text_column (str): Name of the column containing text data. Default is "text". + target_column (str): Name of the column containing target data. Default is "target". 
+ logging_steps (int): Number of steps between logging. Default is -1 (no logging). + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Token for accessing Hugging Face Hub. Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy to use. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum, to qualify as an improvement. Default is 0.01. + """ + data_path: str = Field(None, title="Data path") model: str = Field("bert-base-uncased", title="Model name") lr: float = Field(5e-5, title="Learning rate") diff --git a/src/autotrain/trainers/text_regression/utils.py b/src/autotrain/trainers/text_regression/utils.py index 1ef20c2944..8d058a40c8 100644 --- a/src/autotrain/trainers/text_regression/utils.py +++ b/src/autotrain/trainers/text_regression/utils.py @@ -33,14 +33,27 @@ def single_column_regression_metrics(pred): + """ + Computes various regression metrics for a single column of predictions. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + The first element is an array-like of raw predictions, + and the second element is an array-like of true labels. + + Returns: + dict: A dictionary containing the computed regression metrics: + - "mse": Mean Squared Error + - "mae": Mean Absolute Error + - "r2": R-squared Score + - "rmse": Root Mean Squared Error + - "explained_variance": Explained Variance Score + + Notes: + If any metric computation fails, the function will return a default value of -999 for that metric. + """ raw_predictions, labels = pred - # try: - # raw_predictions = [r for preds in raw_predictions for r in preds] - # except TypeError as err: - # if "numpy.float32" not in str(err): - # raise Exception(err) - def safe_compute(metric_func, default=-999): try: return metric_func(labels, raw_predictions) @@ -63,6 +76,20 @@ def safe_compute(metric_func, default=-999): def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing the following attributes: + - valid_split (optional): Validation split to evaluate the model. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. 
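A sketch of the tokenization step the text regression dataset docstring describes: encode the text column to fixed-length inputs and keep the target as a float label. The checkpoint, column names, and max length are placeholders matching the defaults listed above.

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sample = {"text": "the service was excellent", "target": 4.5}  # toy row
max_len = 128

encoding = tokenizer(
    sample["text"],
    truncation=True,
    padding="max_length",
    max_length=max_len,
    return_tensors="pt",
)

item = {
    "input_ids": encoding["input_ids"].squeeze(0),
    "attention_mask": encoding["attention_mask"].squeeze(0),
    "labels": torch.tensor(sample["target"], dtype=torch.float),
}
print(item["input_ids"].shape, item["labels"])
```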
+ """ if config.valid_split is not None: eval_scores = trainer.evaluate() eval_scores = [ diff --git a/src/autotrain/trainers/token_classification/dataset.py b/src/autotrain/trainers/token_classification/dataset.py index a5f5e5daa0..907c841123 100644 --- a/src/autotrain/trainers/token_classification/dataset.py +++ b/src/autotrain/trainers/token_classification/dataset.py @@ -1,4 +1,31 @@ class TokenClassificationDataset: + """ + A dataset class for token classification tasks. + + Args: + data (Dataset): The dataset containing the text and tags. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the text. + config (Config): Configuration object containing necessary parameters. + + Attributes: + data (Dataset): The dataset containing the text and tags. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the text. + config (Config): Configuration object containing necessary parameters. + + Methods: + __len__(): + Returns the number of samples in the dataset. + + __getitem__(item): + Retrieves a tokenized sample and its corresponding labels. + + Args: + item (int): The index of the sample to retrieve. + + Returns: + dict: A dictionary containing tokenized text and corresponding labels. + """ + def __init__(self, data, tokenizer, config): self.data = data self.tokenizer = tokenizer diff --git a/src/autotrain/trainers/token_classification/params.py b/src/autotrain/trainers/token_classification/params.py index 8fc02337c2..7ad22295ed 100644 --- a/src/autotrain/trainers/token_classification/params.py +++ b/src/autotrain/trainers/token_classification/params.py @@ -6,6 +6,41 @@ class TokenClassificationParams(AutoTrainParams): + """ + TokenClassificationParams is a configuration class for token classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "bert-base-uncased". + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length. Default is 128. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Maximum gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Name of the training split. Default is "train". + valid_split (Optional[str]): Name of the validation split. Default is None. + tokens_column (str): Name of the tokens column. Default is "tokens". + tags_column (str): Name of the tags column. Default is "tags". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision setting (fp16, bf16, or None). Default is None. + save_total_limit (int): Total number of checkpoints to save. Default is 1. + token (Optional[str]): Hub token for authentication. Default is None. + push_to_hub (bool): Whether to push the model to the Hugging Face hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. 
+ log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Patience for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping. Default is 0.01. + """ + data_path: str = Field(None, title="Data path") model: str = Field("bert-base-uncased", title="Model name") lr: float = Field(5e-5, title="Learning rate") diff --git a/src/autotrain/trainers/token_classification/utils.py b/src/autotrain/trainers/token_classification/utils.py index df273ba722..e5f1fdda8d 100644 --- a/src/autotrain/trainers/token_classification/utils.py +++ b/src/autotrain/trainers/token_classification/utils.py @@ -23,6 +23,22 @@ def token_classification_metrics(pred, label_list): + """ + Compute token classification metrics including precision, recall, F1 score, and accuracy. + + Args: + pred (tuple): A tuple containing predictions and labels. + Predictions should be a 3D array (batch_size, sequence_length, num_labels). + Labels should be a 2D array (batch_size, sequence_length). + label_list (list): A list of label names corresponding to the indices used in predictions and labels. + + Returns: + dict: A dictionary containing the following metrics: + - "precision": Precision score of the token classification. + - "recall": Recall score of the token classification. + - "f1": F1 score of the token classification. + - "accuracy": Accuracy score of the token classification. + """ predictions, labels = pred predictions = np.argmax(predictions, axis=2) @@ -45,6 +61,16 @@ def token_classification_metrics(pred, label_list): def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing model and dataset information. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string with dataset tags, validation metrics, and base model information. + """ if config.valid_split is not None: eval_scores = trainer.evaluate() valid_metrics = ["eval_loss", "eval_precision", "eval_recall", "eval_f1", "eval_accuracy"] diff --git a/src/autotrain/trainers/vlm/params.py b/src/autotrain/trainers/vlm/params.py index fa936ed3ef..5e41e25ea0 100644 --- a/src/autotrain/trainers/vlm/params.py +++ b/src/autotrain/trainers/vlm/params.py @@ -6,6 +6,53 @@ class VLMTrainingParams(AutoTrainParams): + """ + VLMTrainingParams is a configuration class for vision-language model (VLM) training parameters. + + Attributes: + model (str): Model name. Default is "google/paligemma-3b-pt-224". + project_name (str): Output directory. Default is "project-name". + + data_path (str): Data path. Default is "data". + train_split (str): Train data config. Default is "train". + valid_split (Optional[str]): Validation data config. Default is None. + + trainer (str): Trainer type (captioning, vqa, segmentation, detection). Default is "vqa". + log (str): Logging method for experiment tracking. Default is "none". + disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing. Default is False. + logging_steps (int): Logging steps. Default is -1. + eval_strategy (str): Evaluation strategy. Default is "epoch". + save_total_limit (int): Save total limit. Default is 1. + auto_find_batch_size (bool): Auto find batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision (fp16, bf16, or None). Default is None. + lr (float): Learning rate. Default is 3e-5. + epochs (int): Number of training epochs. Default is 1. + batch_size (int): Training batch size. Default is 2.
+ warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 4. + optimizer (str): Optimizer. Default is "adamw_torch". + scheduler (str): Scheduler. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Max gradient norm. Default is 1.0. + seed (int): Seed. Default is 42. + + quantization (Optional[str]): Quantization (int4, int8, or None). Default is "int4". + target_modules (Optional[str]): Target modules. Default is "all-linear". + merge_adapter (bool): Merge adapter. Default is False. + peft (bool): Use PEFT. Default is False. + lora_r (int): Lora r. Default is 16. + lora_alpha (int): Lora alpha. Default is 32. + lora_dropout (float): Lora dropout. Default is 0.05. + + image_column (Optional[str]): Image column. Default is "image". + text_column (str): Text (answer) column. Default is "text". + prompt_text_column (Optional[str]): Prompt (prefix) column. Default is "prompt". + + push_to_hub (bool): Push to hub. Default is False. + username (Optional[str]): Hugging Face Username. Default is None. + token (Optional[str]): Huggingface token. Default is None. + """ + model: str = Field("google/paligemma-3b-pt-224", title="Model name") project_name: str = Field("project-name", title="Output directory") diff --git a/src/autotrain/utils.py b/src/autotrain/utils.py index a8a6e5a89b..fe1a3306d3 100644 --- a/src/autotrain/utils.py +++ b/src/autotrain/utils.py @@ -23,6 +23,21 @@ def run_training(params, task_id, local=False, wait=False): + """ + Run the training process based on the provided parameters and task ID. + + Args: + params (str): JSON string of the parameters required for training. + task_id (int): Identifier for the type of task to be performed. + local (bool, optional): Flag to indicate if the training should be run locally. Defaults to False. + wait (bool, optional): Flag to indicate if the function should wait for the process to complete. Defaults to False. + + Returns: + int: Process ID of the launched training process. + + Raises: + NotImplementedError: If the task_id does not match any of the predefined tasks. + """ params = json.loads(params) if isinstance(params, str): params = json.loads(params)
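For readers skimming this patch, a minimal usage sketch of the `run_training` helper documented above may be useful. It assumes only what the docstring states (a JSON string of parameters, an integer task ID, and the `local`/`wait` flags); the parameter keys and the task ID shown below are illustrative placeholders rather than values defined in this diff.

```python
# Hypothetical sketch of calling run_training(), based only on its docstring.
# The parameter keys and the task_id value are illustrative assumptions.
import json

from autotrain.utils import run_training

params = json.dumps(
    {
        "data_path": "data",           # assumed dataset location
        "model": "bert-base-uncased",  # assumed base model
        "project_name": "my-project",  # assumed output directory
    }
)

# task_id selects which trainer is launched; 1 is a placeholder, not a documented mapping.
pid = run_training(params, task_id=1, local=True, wait=True)
print(f"Launched training process with PID {pid}")
```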