Merge branch 'master' of github.com:ludwig-ai/ludwig into local_path_fix
justinxzhao committed Sep 15, 2023
2 parents a4f5adc + c6964f0 commit fddd82d
Showing 38 changed files with 555 additions and 64 deletions.
75 changes: 66 additions & 9 deletions .github/workflows/pytest.yml
@@ -78,7 +78,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1 wget
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -230,7 +230,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -287,7 +287,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -344,7 +344,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -401,7 +401,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -425,7 +425,64 @@ jobs:

      - name: Integration Tests (D)
        run: |
-         RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and not integration_tests_a and not integration_tests_b and not integration_tests_c" --junitxml pytest.xml tests/integration_tests
+         RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and integration_tests_d" --junitxml pytest.xml tests/integration_tests
  integration-tests-e:
    name: Integration Tests (E)
    runs-on: ubuntu-latest

    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }}
      KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
      KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
      IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }}

    services:
      minio:
        image: fclairamb/minio-github-actions
        env:
          MINIO_ACCESS_KEY: minio
          MINIO_SECRET_KEY: minio123
        ports:
          - 9000:9000

    timeout-minutes: 90
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
          sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
        run: |
          brew install libuv
      - name: Install dependencies
        run: |
          python --version
          pip --version
          python -m pip install -U pip
          # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job.
          cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt
          cat requirements_distributed.txt | sed '/^ray[\[]/d'
          pip install torch==2.0.0 torchtext torchvision torchaudio
          pip install ray==2.3.0
          pip install '.[test]'
          pip list
        shell: bash

      - name: Integration Tests (E)
        run: |
          RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and not integration_tests_a and not integration_tests_b and not integration_tests_c and not integration_tests_d" --junitxml pytest.xml tests/integration_tests
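The suites A through E partition tests/integration_tests by pytest markers: suites A-D each select one marker, while the new suite E deselects all four to catch everything left over. A minimal sketch, assuming markers such as integration_tests_d are registered in the project's pytest configuration:

import pytest


@pytest.mark.integration_tests_d
def test_runs_in_suite_d():
    # Selected by the "Integration Tests (D)" job via -m "... and integration_tests_d".
    ...


def test_runs_in_suite_e():
    # Carries no suite marker, so only the catch-all "Integration Tests (E)" job
    # (which deselects markers a-d) collects it.
    ...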
  llm-tests:
    name: LLM Tests
@@ -442,7 +499,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -483,7 +540,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -522,7 +579,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
11 changes: 11 additions & 0 deletions ludwig/api.py
@@ -808,6 +808,17 @@ def train_online(
        self.model = self._online_trainer.train_online(training_dataset)

    def _tune_batch_size(self, trainer, dataset, random_seed: int = default_random_seed):
        """Sets AUTO batch-size-related parameters based on the trainer, backend type, and number of workers.

        Batch-size-related parameters that are set:
        - trainer.batch_size
        - trainer.eval_batch_size
        - trainer.gradient_accumulation_steps
        - trainer.effective_batch_size

        The final batch size selected may be non-deterministic even with a fixed random seed since throughput-based
        heuristics may be affected by resources used by other processes running on the machine.
        """
        if not self.config_obj.trainer.can_tune_batch_size():
            # Models like GBMs don't have batch sizes to be tuned
            return
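For context on the docstring above, a minimal sketch of how AUTO batch-size tuning is typically triggered from the user-facing API, assuming a standard Ludwig config (feature names and dataset path are placeholders):

from ludwig.api import LudwigModel

config = {
    "input_features": [{"name": "review", "type": "text"}],
    "output_features": [{"name": "sentiment", "type": "category"}],
    # "auto" values are rendered by _tune_batch_size at training time.
    "trainer": {"batch_size": "auto", "eval_batch_size": "auto"},
}

model = LudwigModel(config)
# model.train(dataset="reviews.csv")  # hypothetical dataset path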
18 changes: 18 additions & 0 deletions ludwig/datasets/configs/code_alpaca.yaml
@@ -0,0 +1,18 @@
version: 1.0
name: code_alpaca
download_urls: https://raw.githubusercontent.com/sahil280114/codealpaca/master/data/code_alpaca_20k.json
train_filenames: code_alpaca_20k.json
loader: code_alpaca_loader.CodeAlpacaLoader
description: |
  This dataset, created by sahil280114, aims to build and share an instruction-following LLaMA model for code
  generation. The repo containing this dataset is fully based on Stanford Alpaca, and only changes the data
  used for training.
columns:
  - name: instruction
    type: text
  - name: input
    type: text
  - name: output
    type: text
output_features:
  - name: output
    type: text
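A hedged usage sketch, assuming the new config is exposed through Ludwig's module-level datasets API like the other configs in ludwig/datasets/configs:

from ludwig.datasets import code_alpaca

# Downloads code_alpaca_20k.json and loads it via CodeAlpacaLoader.
df = code_alpaca.load()
print(df[["instruction", "input", "output"]].head())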
50 changes: 50 additions & 0 deletions ludwig/datasets/configs/consumer_complaints.yaml
@@ -0,0 +1,50 @@
version: 1.0
name: consumer_complaints
kaggle_dataset_id: selener/consumer-complaint-database
archive_filenames: consumer-complaint-database.zip
dataset_filenames: rows.csv
loader: consumer_complaints_loader.ConsumerComplaintsLoader
description: |
  The dataset contains information about complaints that customers have made about multiple products and
  services in the financial sector, such as credit reports, student loans, money transfers, etc. The date of
  each complaint ranges from November 2011 to May 2019.
columns:
  - name: Date received
    type: date
  - name: Product
    type: text
  - name: Sub-product
    type: text
  - name: Issue
    type: text
  - name: Sub-issue
    type: text
  - name: Consumer complaint narrative
    type: text
  - name: Company public response
    type: text
  - name: Company
    type: text
  - name: State
    type: category
  - name: ZIP code
    type: category
  - name: Tags
    type: category
  - name: Consumer consent provided?
    type: text
  - name: Submitted via
    type: category
  - name: Date sent to company
    type: date
  - name: Company response to consumer
    type: text
  - name: Timely response?
    type: binary
  - name: Consumer disputed?
    type: binary
  - name: Complaint ID
    type: number
output_features:
  - name: Issue
    type: text
27 changes: 27 additions & 0 deletions ludwig/datasets/loaders/code_alpaca_loader.py
@@ -0,0 +1,27 @@
# Copyright (c) 2022 Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import pandas as pd

from ludwig.datasets.loaders.dataset_loader import DatasetLoader


class CodeAlpacaLoader(DatasetLoader):
    """The Code Alpaca dataset."""

    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
        """Loads a file into a dataframe."""
        df = pd.read_json(file_path)
        return df
45 changes: 45 additions & 0 deletions ludwig/datasets/loaders/consumer_complaints_loader.py
@@ -0,0 +1,45 @@
# Copyright (c) 2022 Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd

from ludwig.datasets.loaders.dataset_loader import DatasetLoader


class ConsumerComplaintsLoader(DatasetLoader):
    """The Consumer Complaints dataset."""

    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
        """Loads a file into a dataframe."""
        consumer_complaints_df = pd.read_csv(file_path)
        consumer_complaints_df = preprocess_df(consumer_complaints_df)

        return consumer_complaints_df


def preprocess_df(df):
    """Preprocesses the dataframe.

    Removes all rows with missing values in the following columns:
    - Consumer complaint narrative
    - Issue
    - Product

    Args:
        df (pd.DataFrame): The dataframe to preprocess.

    Returns:
        pd.DataFrame: The preprocessed dataframe.
    """
    return df.dropna(subset=["Consumer complaint narrative", "Issue", "Product"])
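A hedged sketch of the loader's end-to-end effect, assuming the module-level datasets API; per the config's kaggle_dataset_id, Kaggle credentials are needed:

from ludwig.datasets import consumer_complaints

# Requires KAGGLE_USERNAME/KAGGLE_KEY; downloads rows.csv and applies preprocess_df.
df = consumer_complaints.load()

# Rows missing any of these three columns were dropped by preprocess_df.
assert df["Consumer complaint narrative"].notna().all()
assert df["Issue"].notna().all()
assert df["Product"].notna().all()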
1 change: 1 addition & 0 deletions ludwig/models/llm.py
@@ -384,6 +384,7 @@ def forward(
        with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if (
            torch.cuda.is_available() and self.curr_device.type == "cuda"
        ) else contextlib.nullcontext():
            # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass
            model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS)

        if self.output_feature_type != TEXT:
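For context on the guarded block above, a standalone sketch of the same PyTorch 2.0 kernel-selection pattern (tensor shapes are placeholders):

import contextlib

import torch
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
dtype = torch.float16 if use_cuda else torch.float32  # flash attention requires fp16/bf16

# (batch, heads, seq_len, head_dim)
q = k = v = torch.randn(1, 8, 128, 64, device=device, dtype=dtype)

ctx = (
    torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False)
    if use_cuda
    else contextlib.nullcontext()
)
with ctx:
    out = F.scaled_dot_product_attention(q, k, v)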
2 changes: 1 addition & 1 deletion ludwig/modules/metric_modules.py
@@ -410,7 +410,7 @@ def __init__(self, **kwargs):
        super().__init__()


-@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, PREDICTIONS)
+@register_metric("char_error_rate", [TEXT], MINIMIZE, RESPONSE)
class CharErrorRateMetric(CharErrorRate, LudwigMetric):
    def __init__(self, **kwargs):
        super().__init__()
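This change narrows char_error_rate to TEXT features and computes it against the decoded RESPONSE rather than raw PREDICTIONS. Since CharErrorRateMetric subclasses torchmetrics' CharErrorRate, the underlying metric can be illustrated directly (example strings are made up):

from torchmetrics.text import CharErrorRate

cer = CharErrorRate()
# Character-level edit distance divided by the number of characters in the target.
score = cer(preds=["ludwig is grate"], target=["ludwig is great"])
print(float(score))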
12 changes: 12 additions & 0 deletions ludwig/schema/metadata/configs/trainer.yaml
@@ -429,6 +429,18 @@ ecd:
      Suggested to enable this if training is proceeding very slowly in distributed training (and GPU
      utilization is low), or the batch size is very small and the loss curves look very spiky.
    ui_display_name: Gradient Accumulation Steps
  enable_gradient_checkpointing:
    expected_impact: 2
    ui_display_name: Enable Gradient Checkpointing
    default_value_reasoning:
      Gradient checkpointing is a technique to reduce the memory footprint of the model by trading compute for
      memory. It is useful when training very large models that quickly run out of memory during training, and
      is particularly helpful for non-quantization-based training (adapter-based or full fine-tuning). Gradient
      checkpointing works by recomputing the activations of the model during the backward pass rather than
      storing them in memory during the forward pass, reducing the memory footprint at the cost of extra
      computation. It is set to false by default because gradient checkpointing is not always beneficial and
      can sometimes slow down training.
  validation_field:
    default_value_reasoning:
      Concrete evaluation metrics are usually better than loss,
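A hedged sketch of turning the new flag on in a fine-tuning config (base model and feature names are placeholders):

config = {
    "model_type": "llm",
    "base_model": "facebook/opt-350m",
    "input_features": [{"name": "prompt", "type": "text"}],
    "output_features": [{"name": "response", "type": "text"}],
    "trainer": {
        "type": "finetune",
        # Recompute activations during the backward pass instead of storing
        # them, trading extra compute for a smaller memory footprint.
        "enable_gradient_checkpointing": True,
    },
}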
12 changes: 12 additions & 0 deletions ludwig/schema/trainer.py
@@ -409,6 +409,13 @@ def __post_init__(self):
        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["compile"],
    )

    enable_gradient_checkpointing: bool = schema_utils.Boolean(
        default=False,
        description="Whether to enable gradient checkpointing, which trades compute for memory. "
        "This is useful for training very deep models with limited memory.",
        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["enable_gradient_checkpointing"],
    )

    def update_batch_size_grad_accum(self, num_workers: int):
        from ludwig.utils.trainer_utils import get_rendered_batch_size_grad_accum

@@ -881,6 +888,11 @@ class FineTuneTrainerConfig(ECDTrainerConfig):
description="Base learning rate used for training in the LLM trainer.",
)

eval_batch_size: int = schema_utils.PositiveInteger(
default=2,
description="Batch size used for evaluation in the LLM trainer.",
)


@DeveloperAPI
def get_model_type_jsonschema(model_type: str = MODEL_ECD):
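A minimal sketch of the new default, assuming FineTuneTrainerConfig can be instantiated directly with its field defaults:

from ludwig.schema.trainer import FineTuneTrainerConfig

cfg = FineTuneTrainerConfig()
print(cfg.eval_batch_size)  # 2, the new LLM fine-tuning default

# Override when more memory is available for evaluation.
cfg = FineTuneTrainerConfig(eval_batch_size=8)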