ludwig-ai · connor-mccorm · Sep 15, 2023 · Jun 29, 2023 · Jul 31, 2023 · Aug 23, 2023
@@ -0,0 +1,18 @@
+version: 1.0
+name: code_alpaca
+download_urls: https://raw.githubusercontent.com/sahil280114/codealpaca/master/data/code_alpaca_20k.json
+train_filenames: code_alpaca_20k.json
+loader: code_alpaca_loader.CodeAlpacaLoader
+description: |
+  This dataset, created by sahil280114, aims to build and share an instruction-following LLaMA model for code generation. The repo containing
+  this dataset is fully based on Stanford Alpaca, and only changes the data used for training.
+columns:
+  - name: instruction
+    type: text
+  - name: input
+    type: text
+  - name: output
+    type: text
+output_features:
+  - name: output
+    type: text
@@ -0,0 +1,50 @@
+version: 1.0
+name: consumer_complaints
+kaggle_dataset_id: selener/consumer-complaint-database
+archive_filenames: consumer-complaint-database.zip
+dataset_filenames: rows.csv
+loader: consumer_complaints_loader.ConsumerComplaintsLoader
+description: |
+  The dataset contains information about complaints that customers have made about multiple products and
+  services in the financial sector, such as Credit Reports, Student Loans, Money Transfer, etc. The date of each
+  complaint ranges from November 2011 to May 2019.
+columns:
+  - name: Date received
+    type: date
+  - name: Product
+    type: text
+  - name: Sub-product
+    type: text
+  - name: Issue
+    type: text
+  - name: Sub-issue
+    type: text
+  - name: Consumer complaint narrative
+    type: text
+  - name: Company public response
+    type: text
+  - name: Company
+    type: text
+  - name: State
+    type: category
+  - name: ZIP code
+    type: category
+  - name: Tags
+    type: category
+  - name: Consumer consent provided?
+    type: text
+  - name: Submitted via
+    type: category
+  - name: Date sent to company
+    type: date
+  - name: Company response to consumer
+    type: text
+  - name: Timely response?
+    type: binary
+  - name: Consumer disputed?
+    type: binary
+  - name: Complaint ID
+    type: number
+output_features:
+  - name: Issue
+    type: text
@@ -0,0 +1,27 @@
+# Copyright (c) 2022 Predibase, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import pandas as pd
+
+from ludwig.datasets.loaders.dataset_loader import DatasetLoader
+
+
+class CodeAlpacaLoader(DatasetLoader):
+    """The Code Alpaca dataset."""
+
+    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
+        """Loads a file into a dataframe."""
+        df = pd.read_json(file_path)
+        return df
@@ -0,0 +1,45 @@
+# Copyright (c) 2022 Predibase, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pandas as pd
+
+from ludwig.datasets.loaders.dataset_loader import DatasetLoader
+
+
+class ConsumerComplaintsLoader(DatasetLoader):
+    """The Consumer Complaints dataset."""
+
+    def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
+        """Loads a file into a dataframe."""
+
+        consumer_complaints_df = pd.read_csv(file_path)
+        consumer_complaints_df = preprocess_df(consumer_complaints_df)
+
+        return consumer_complaints_df
+
+
+def preprocess_df(df):
+    """Preprocesses the dataframe.
+
+        - Remove all rows with missing values in the following columns:
+            - Consumer complaint narrative
+            - Issue
+            - Product
+
+    Args:
+        df (pd.DataFrame): The dataframe to preprocess.
+
+    Returns:
+        pd.DataFrame: The preprocessed dataframe.
+    """
+    return df.dropna(subset=["Consumer complaint narrative", "Issue", "Product"])
@@ -227,3 +227,15 @@ def test_dataset_fallback_mirror(dataset_name, shape):
 
     assert isinstance(dataset, pd.DataFrame)
     assert dataset.shape == shape
+
+
+@private_test
+@pytest.mark.parametrize("dataset_name, size", [("code_alpaca", 20000), ("consumer_complaints", 38000)])
+def test_ad_hoc_dataset_download(tmpdir, dataset_name, size):
+    dataset_config = ludwig.datasets._get_dataset_config(dataset_name)
+    assert isinstance(dataset_config, DatasetConfig)
+
+    ludwig_dataset = ludwig.datasets.get_dataset(dataset_name, cache_dir=tmpdir)
+    df = ludwig_dataset.load()
+    assert df is not None
+    assert len(df) >= size