Merge branch 'master' of github.com:ludwig-ai/ludwig into local_path_fix
justinxzhao committed Sep 15, 2023
2 parents a4f5adc + c6964f0 commit fddd82d
Showing 38 changed files with 555 additions and 64 deletions.
75 changes: 66 additions & 9 deletions .github/workflows/pytest.yml
@@ -78,7 +78,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1 wget
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1 wget
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -230,7 +230,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -287,7 +287,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -344,7 +344,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -401,7 +401,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -425,7 +425,64 @@ jobs:

      - name: Integration Tests (D)
        run: |
-         RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and not integration_tests_a and not integration_tests_b and not integration_tests_c" --junitxml pytest.xml tests/integration_tests
+         RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and integration_tests_d" --junitxml pytest.xml tests/integration_tests
  integration-tests-e:
    name: Integration Tests (E)
    runs-on: ubuntu-latest

    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.LUDWIG_TESTS_AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.LUDWIG_TESTS_AWS_SECRET_ACCESS_KEY }}
      KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
      KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
      IS_NOT_FORK: ${{ !(github.event.pull_request.base.repo.full_name == 'ludwig-ai/ludwig' && github.event.pull_request.head.repo.fork) }}

    services:
      minio:
        image: fclairamb/minio-github-actions
        env:
          MINIO_ACCESS_KEY: minio
          MINIO_SECRET_KEY: minio123
        ports:
          - 9000:9000

    timeout-minutes: 90
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
          sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
        run: |
          brew install libuv
      - name: Install dependencies
        run: |
          python --version
          pip --version
          python -m pip install -U pip
          # remove torch and ray from the dependencies so we can add them depending on the matrix args for the job.
          cat requirements.txt | sed '/^torch[>=<\b]/d' | sed '/^torchtext/d' | sed '/^torchvision/d' | sed '/^torchaudio/d' > requirements-temp && mv requirements-temp requirements.txt
          cat requirements_distributed.txt | sed '/^ray[\[]/d'
          pip install torch==2.0.0 torchtext torchvision torchaudio
          pip install ray==2.3.0
          pip install '.[test]'
          pip list
        shell: bash

      - name: Integration Tests (E)
        run: |
          RUN_PRIVATE=$IS_NOT_FORK LUDWIG_TEST_SUITE_TIMEOUT_S=7200 pytest -v --timeout 300 --durations 100 -m "not slow and not combinatorial and not horovod and not llm and not integration_tests_a and not integration_tests_b and not integration_tests_c and not integration_tests_d" --junitxml pytest.xml tests/integration_tests
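The suites A through E partition tests/integration_tests by pytest markers: suites A-D each select one marker, while the new suite E deselects all four to catch everything left over. A minimal sketch, assuming markers such as integration_tests_d are registered in the project's pytest configuration:

import pytest


@pytest.mark.integration_tests_d
def test_runs_in_suite_d():
    # Selected by the "Integration Tests (D)" job via -m "... and integration_tests_d".
    ...


def test_runs_in_suite_e():
    # Carries no suite marker, so only the catch-all "Integration Tests (E)" job
    # (which deselects markers a-d) collects it.
    ...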
  llm-tests:
    name: LLM Tests
@@ -442,7 +499,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -483,7 +540,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
@@ -522,7 +579,7 @@ jobs:
      - name: Setup Linux
        if: runner.os == 'linux'
        run: |
-         sudo apt-get install -y cmake libsndfile1
+         sudo apt-get update && sudo apt-get install -y cmake libsndfile1
      - name: Setup macOS
        if: runner.os == 'macOS'
11 changes: 11 additions & 0 deletions ludwig/api.py
@@ -808,6 +808,17 @@ def train_online(
        self.model = self._online_trainer.train_online(training_dataset)

    def _tune_batch_size(self, trainer, dataset, random_seed: int = default_random_seed):
        """Sets AUTO batch-size-related parameters based on the trainer, backend type, and number of workers.

        Batch-size-related parameters that are set:
        - trainer.batch_size
        - trainer.eval_batch_size
        - trainer.gradient_accumulation_steps
        - trainer.effective_batch_size

        The final batch size selected may be non-deterministic even with a fixed random seed since throughput-based
        heuristics may be affected by resources used by other processes running on the machine.
        """
        if not self.config_obj.trainer.can_tune_batch_size():
            # Models like GBMs don't have batch sizes to be tuned
            return
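For context on the docstring above, a minimal sketch of how AUTO batch-size tuning is typically triggered from the user-facing API, assuming a standard Ludwig config (feature names and dataset path are placeholders):

from ludwig.api import LudwigModel

config = {
    "input_features": [{"name": "review", "type": "text"}],
    "output_features": [{"name": "sentiment", "type": "category"}],
    # "auto" values are rendered by _tune_batch_size at training time.
    "trainer": {"batch_size": "auto", "eval_batch_size": "auto"},
}

model = LudwigModel(config)
# model.train(dataset="reviews.csv")  # hypothetical dataset path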
18 changes: 18 additions & 0 deletions ludwig/datasets/configs/code_alpaca.yaml
@@ -0,0 +1,18 @@
version: 1.0
name: code_alpaca
download_urls: https://raw.githubusercontent.com/sahil280114/codealpaca/master/data/code_alpaca_20k.json
train_filenames: code_alpaca_20k.json
loader: code_alpaca_loader.CodeAlpacaLoader
description: |
  This dataset, created by sahil280114, aims to build and share an instruction-following LLaMA model for code
  generation. The repo containing this dataset is fully based on Stanford Alpaca, and only changes the data
  used for training.
columns:
  - name: instruction
    type: text
  - name: input
    type: text
  - name: output
    type: text
output_features:
  - name: output
    type: text
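A hedged usage sketch, assuming the new config is exposed through Ludwig's module-level datasets API like the other configs in ludwig/datasets/configs:

from ludwig.datasets import code_alpaca

# Downloads code_alpaca_20k.json and loads it via CodeAlpacaLoader.
df = code_alpaca.load()
print(df[["instruction", "input", "output"]].head())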
50 changes: 50 additions & 0 deletions ludwig/datasets/configs/consumer_complaints.yaml
@@ -0,0 +1,50 @@
version: 1.0
name: consumer_complaints
kaggle_dataset_id: selener/consumer-complaint-database
archive_filenames: consumer-complaint-database.zip
dataset_filenames: rows.csv
loader: consumer_complaints_loader.ConsumerComplaintsLoader
description: |
  The dataset contains information about complaints that customers have made about multiple products and
  services in the financial sector, such as credit reports, student loans, money transfers, etc. The date of
  each complaint ranges from November 2011 to May 2019.
columns:
  - name: Date received
    type: date
  - name: Product
    type: text
  - name: Sub-product
    type: text
  - name: Issue
    type: text
  - name: Sub-issue
    type: text
  - name: Consumer complaint narrative
    type: text
  - name: Company public response
    type: text
  - name: Company
    type: text
  - name: State
    type: category
  - name: ZIP code
    type: category
  - name: Tags
    type: category
  - name: Consumer consent provided?
    type: text
  - name: Submitted via
    type: category
  - name: Date sent to company
    type: date
  - name: Company response to consumer
    type: text
  - name: Timely response?
    type: binary
  - name: Consumer disputed?
    type: binary
  - name: Complaint ID
    type: number
output_features:
  - name: Issue
    type: text
27 changes: 27 additions & 0 deletions ludwig/datasets/loaders/code_alpaca_loader.py
@@ -0,0 +1,27 @@
# Copyright (c) 2022 Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import pandas as pd

from ludwig.datasets.loaders.dataset_loader import DatasetLoader


class CodeAlpacaLoader(DatasetLoader):
    """The Code Alpaca dataset."""

    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
        """Loads a file into a dataframe."""
        df = pd.read_json(file_path)
        return df
45 changes: 45 additions & 0 deletions ludwig/datasets/loaders/consumer_complaints_loader.py
@@ -0,0 +1,45 @@
# Copyright (c) 2022 Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd

from ludwig.datasets.loaders.dataset_loader import DatasetLoader


class ConsumerComplaintsLoader(DatasetLoader):
    """The Consumer Complaints dataset."""

    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
        """Loads a file into a dataframe."""
        consumer_complaints_df = pd.read_csv(file_path)
        consumer_complaints_df = preprocess_df(consumer_complaints_df)

        return consumer_complaints_df


def preprocess_df(df):
    """Preprocesses the dataframe.

    Removes all rows with missing values in the following columns:
    - Consumer complaint narrative
    - Issue
    - Product

    Args:
        df (pd.DataFrame): The dataframe to preprocess.

    Returns:
        pd.DataFrame: The preprocessed dataframe.
    """
    return df.dropna(subset=["Consumer complaint narrative", "Issue", "Product"])
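A hedged sketch of the loader's end-to-end effect, assuming the module-level datasets API; per the config's kaggle_dataset_id, Kaggle credentials are needed:

from ludwig.datasets import consumer_complaints

# Requires KAGGLE_USERNAME/KAGGLE_KEY; downloads rows.csv and applies preprocess_df.
df = consumer_complaints.load()

# Rows missing any of these three columns were dropped by preprocess_df.
assert df["Consumer complaint narrative"].notna().all()
assert df["Issue"].notna().all()
assert df["Product"].notna().all()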
1 change: 1 addition & 0 deletions ludwig/models/llm.py
@@ -384,6 +384,7 @@ def forward(
        with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) if (
            torch.cuda.is_available() and self.curr_device.type == "cuda"
        ) else contextlib.nullcontext():
            # TODO (jeffkinnison): Determine why the 8-bit `SCB` and `CB` matrices are deleted in the forward pass
            model_outputs = self.model(input_ids=self.model_inputs, attention_mask=self.attention_masks).get(LOGITS)

        if self.output_feature_type != TEXT:
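For context on the guarded block above, a standalone sketch of the same PyTorch 2.0 kernel-selection pattern (tensor shapes are placeholders):

import contextlib

import torch
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"
dtype = torch.float16 if use_cuda else torch.float32  # flash attention requires fp16/bf16

# (batch, heads, seq_len, head_dim)
q = k = v = torch.randn(1, 8, 128, 64, device=device, dtype=dtype)

ctx = (
    torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False)
    if use_cuda
    else contextlib.nullcontext()
)
with ctx:
    out = F.scaled_dot_product_attention(q, k, v)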
2 changes: 1 addition & 1 deletion ludwig/modules/metric_modules.py
@@ -410,7 +410,7 @@ def __init__(self, **kwargs):
        super().__init__()


-@register_metric("char_error_rate", [SEQUENCE, TEXT], MINIMIZE, PREDICTIONS)
+@register_metric("char_error_rate", [TEXT], MINIMIZE, RESPONSE)
class CharErrorRateMetric(CharErrorRate, LudwigMetric):
    def __init__(self, **kwargs):
        super().__init__()
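This change narrows char_error_rate to TEXT features and computes it against the decoded RESPONSE rather than raw PREDICTIONS. Since CharErrorRateMetric subclasses torchmetrics' CharErrorRate, the underlying metric can be illustrated directly (example strings are made up):

from torchmetrics.text import CharErrorRate

cer = CharErrorRate()
# Character-level edit distance divided by the number of characters in the target.
score = cer(preds=["ludwig is grate"], target=["ludwig is great"])
print(float(score))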
12 changes: 12 additions & 0 deletions ludwig/schema/metadata/configs/trainer.yaml
@@ -429,6 +429,18 @@ ecd:
      Suggested to enable this if training is proceeding very slowly in distributed training (and GPU
      utilization is low), or the batch size is very small and the loss curves look very spiky.
    ui_display_name: Gradient Accumulation Steps
  enable_gradient_checkpointing:
    expected_impact: 2
    ui_display_name: Enable Gradient Checkpointing
    default_value_reasoning:
      Gradient checkpointing is a technique to reduce the memory footprint of the model by trading compute for
      memory. It is useful when training very large models that quickly run out of memory during training, and
      is particularly helpful for non-quantization-based training (adapter-based or full fine-tuning). Gradient
      checkpointing works by recomputing the activations of the model during the backward pass rather than
      storing them in memory during the forward pass, reducing the memory footprint at the cost of extra
      computation. It is set to false by default because gradient checkpointing is not always beneficial and
      can sometimes slow down training.
  validation_field:
    default_value_reasoning:
      Concrete evaluation metrics are usually better than loss,
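A hedged sketch of turning the new flag on in a fine-tuning config (base model and feature names are placeholders):

config = {
    "model_type": "llm",
    "base_model": "facebook/opt-350m",
    "input_features": [{"name": "prompt", "type": "text"}],
    "output_features": [{"name": "response", "type": "text"}],
    "trainer": {
        "type": "finetune",
        # Recompute activations during the backward pass instead of storing
        # them, trading extra compute for a smaller memory footprint.
        "enable_gradient_checkpointing": True,
    },
}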
12 changes: 12 additions & 0 deletions ludwig/schema/trainer.py
@@ -409,6 +409,13 @@ def __post_init__(self):
        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["compile"],
    )

    enable_gradient_checkpointing: bool = schema_utils.Boolean(
        default=False,
        description="Whether to enable gradient checkpointing, which trades compute for memory. "
        "This is useful for training very deep models with limited memory.",
        parameter_metadata=TRAINER_METADATA[MODEL_ECD]["enable_gradient_checkpointing"],
    )

    def update_batch_size_grad_accum(self, num_workers: int):
        from ludwig.utils.trainer_utils import get_rendered_batch_size_grad_accum

@@ -881,6 +888,11 @@ class FineTuneTrainerConfig(ECDTrainerConfig):
description="Base learning rate used for training in the LLM trainer.",
)

eval_batch_size: int = schema_utils.PositiveInteger(
default=2,
description="Batch size used for evaluation in the LLM trainer.",
)


@DeveloperAPI
def get_model_type_jsonschema(model_type: str = MODEL_ECD):
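A minimal sketch of the new default, assuming FineTuneTrainerConfig can be instantiated directly with its field defaults:

from ludwig.schema.trainer import FineTuneTrainerConfig

cfg = FineTuneTrainerConfig()
print(cfg.eval_batch_size)  # 2, the new LLM fine-tuning default

# Override when more memory is available for evaluation.
cfg = FineTuneTrainerConfig(eval_batch_size=8)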