Merge pull request #10 from WenjieDu/dev
Add Solar Alabama processing pipeline and request tsdb>=0.5
WenjieDu authored Jul 4, 2024
2 parents ecea3ca + d3d0bc5 commit d857eb6
Showing 7 changed files with 153 additions and 4 deletions.
23 changes: 23 additions & 0 deletions README.md
@@ -52,6 +52,29 @@ To evaluate the performance of algorithms on POTS datasets, a benchmarking toolk
BenchPOTS provides standard and unified preprocessing pipelines for a variety of POTS datasets.
It supports a variety of evaluation tasks to help users understand the performance of different algorithms.


## ❖ Usage Examples
> [!IMPORTANT]
> BenchPOTS is available on both <a alt='PyPI' href='https://pypi.python.org/pypi/benchpots'><img align='center' src='https://img.shields.io/badge/PyPI--lightgreen?style=social&logo=pypi'></a>
> and <a alt='Anaconda' href='https://anaconda.org/conda-forge/benchpots'><img align='center' src='https://img.shields.io/badge/Anaconda--lightgreen?style=social&logo=anaconda'></a>❗️
>
> Install via pip:
> > pip install benchpots
>
> or install from source code:
> > pip install `https://github.com/WenjieDu/BenchPOTS/archive/main.zip`
>
> or install via conda:
> > conda install benchpots -c conda-forge
```python
import benchpots

# Load all three subsets of PhysioNet2012 and apply MCAR missingness with a 0.1 rate
benchpots.datasets.preprocess_physionet2012(subset="all", rate=0.1)

```
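
For the Solar Alabama pipeline added in this pull request, usage follows the same pattern; a minimal sketch (the 48-step window size is just an illustrative value):

```python
import benchpots

# slice the Solar Alabama series into 48-step windows and mask 10% of values (point missingness)
solar = benchpots.datasets.preprocess_solar_alabama(rate=0.1, n_steps=48, pattern="point")
```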

## ❖ Citing BenchPOTS/PyPOTS
The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811),
and a short version of it is accepted by the 9th SIGKDD International Workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/)).
9 changes: 8 additions & 1 deletion benchpots/__init__.py
@@ -22,4 +22,11 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.1.1"
__version__ = "0.2"

from . import utils, datasets

__all__ = [
    "utils",
    "datasets",
]
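
With the subpackages now re-exported at the top level, a quick sanity-check sketch of what this hunk implies (the outputs in the comments are expectations, not captured logs):

```python
import benchpots

print(benchpots.__version__)  # "0.2" after this commit
print(benchpots.__all__)      # ['utils', 'datasets']
benchpots.datasets            # accessible without an explicit `from benchpots import datasets`
```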
2 changes: 2 additions & 0 deletions benchpots/datasets/__init__.py
@@ -13,6 +13,7 @@
from .physionet_2012 import preprocess_physionet2012
from .physionet_2019 import preprocess_physionet2019
from .ucr_uea_datasets import preprocess_ucr_uea_datasets
from .solar_alabama import preprocess_solar_alabama

__all__ = [
"preprocess_physionet2012",
@@ -23,4 +24,5 @@
"preprocess_ett",
"preprocess_pems_traffic",
"preprocess_ucr_uea_datasets",
"preprocess_solar_alabama",
]
117 changes: 117 additions & 0 deletions benchpots/datasets/solar_alabama.py
@@ -0,0 +1,117 @@
"""
Preprocessing func for the dataset Solar Alabama.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import pandas as pd
import tsdb
from sklearn.preprocessing import StandardScaler

from ..utils.logging import logger, print_final_dataset_info
from ..utils.missingness import create_missingness
from ..utils.sliding import sliding_window


def preprocess_solar_alabama(
    rate,
    n_steps,
    pattern: str = "point",
    **kwargs,
) -> dict:
"""Load and preprocess the dataset Solar Alabama.
Parameters
----------
rate:
The missing rate.
n_steps:
The number of time steps to in the generated data samples.
Also the window size of the sliding window.
pattern:
The missing pattern to apply to the dataset.
Must be one of ['point', 'subseq', 'block'].
Returns
-------
processed_dataset :
A dictionary containing the processed Solar Alabama.
"""

    assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
    assert n_steps > 0, f"n_steps must be larger than 0, but got {n_steps}"

    # read the raw data
    data = tsdb.load("solar_alabama")
    df = data["X"]

    feature_names = df.columns.tolist()
    feature_names.remove("date")
    df["date"] = pd.to_datetime(df["date"])

    unique_months = df["date"].dt.to_period("M").unique()
    selected_as_train = unique_months[:6]  # use the first 6 months as the train set
    logger.info(f"months selected as train set are {selected_as_train}")
    selected_as_val = unique_months[6:9]  # use the following 3 months as the val set
    logger.info(f"months selected as val set are {selected_as_val}")
    selected_as_test = unique_months[9:]  # use the remaining 3 months as the test set
    logger.info(f"months selected as test set are {selected_as_test}")

    test_set = df[df["date"].dt.to_period("M").isin(selected_as_test)]
    val_set = df[df["date"].dt.to_period("M").isin(selected_as_val)]
    train_set = df[df["date"].dt.to_period("M").isin(selected_as_train)]

    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_set.loc[:, feature_names])
    val_X = scaler.transform(val_set.loc[:, feature_names])
    test_X = scaler.transform(test_set.loc[:, feature_names])

    train_X = sliding_window(train_X, n_steps)
    val_X = sliding_window(val_X, n_steps)
    test_X = sliding_window(test_X, n_steps)

    # assemble the final processed data into a dictionary
    processed_dataset = {
        # general info
        "n_steps": n_steps,
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # train set
        "train_X": train_X,
        # val set
        "val_X": val_X,
        # test set
        "test_X": test_X,
    }

    if rate > 0:
        # hold out ground truth in the original data for evaluation
        train_X_ori = train_X
        val_X_ori = val_X
        test_X_ori = test_X

        # mask values in the train set to keep it consistent with the validation and test sets below
        train_X = create_missingness(train_X, rate, pattern, **kwargs)
        # mask values in the validation set as ground truth
        val_X = create_missingness(val_X, rate, pattern, **kwargs)
        # mask values in the test set as ground truth
        test_X = create_missingness(test_X, rate, pattern, **kwargs)

        processed_dataset["train_X"] = train_X
        processed_dataset["train_X_ori"] = train_X_ori

        processed_dataset["val_X"] = val_X
        processed_dataset["val_X_ori"] = val_X_ori

        processed_dataset["test_X"] = test_X
        # test_X_ori is for error calculation, not for model input, hence it must not contain NaNs
        processed_dataset["test_X_ori"] = test_X_ori
    else:
        logger.warning("rate is 0, no missing values are artificially added.")

    print_final_dataset_info(train_X, val_X, test_X)
    return processed_dataset
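
Taken together, the new module windows the standardized data and, when `rate > 0`, keeps the original values alongside the masked ones for evaluation. A hedged usage sketch of the returned dictionary (array shapes are indicative only and depend on the chosen window size and the tsdb copy of the data):

```python
import benchpots

solar = benchpots.datasets.preprocess_solar_alabama(rate=0.1, n_steps=48, pattern="point")

print(solar["n_steps"], solar["n_features"])  # 48 and the number of feature columns (all columns except "date")
print(solar["train_X"].shape)                 # roughly (n_train_windows, 48, n_features), with artificial NaNs
print(solar["train_X_ori"].shape)             # same shape, without the artificial missingness
print(solar["scaler"])                        # the fitted StandardScaler, usable for inverse-transforming outputs
```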
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,5 +3,5 @@ numpy
pandas
scikit-learn
torch >=1.10
tsdb >=0.4
tsdb >=0.5
pygrinder >=0.6
2 changes: 1 addition & 1 deletion setup.cfg
@@ -28,5 +28,5 @@ basic =
    pandas
    scikit-learn
    torch >=1.10
    tsdb >=0.4
    tsdb >=0.5
    pygrinder >=0.6
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
"pandas",
"scikit-learn",
"torch >=1.10",
"tsdb >=0.4",
"tsdb >=0.5",
"pygrinder >=0.6",
],
python_requires=">=3.8.0",
