Merge pull request #10 from WenjieDu/dev
Add Solar Alabama processing pipeline and request tsdb>=0.5
WenjieDu authored Jul 4, 2024
2 parents ecea3ca + d3d0bc5 commit d857eb6
Showing 7 changed files with 153 additions and 4 deletions.
23 changes: 23 additions & 0 deletions README.md
@@ -52,6 +52,29 @@ To evaluate the performance of algorithms on POTS datasets, a benchmarking toolk
BenchPOTS provides standard and unified preprocessing pipelines for a variety of POTS datasets.
It supports a variety of evaluation tasks to help users understand the performance of different algorithms.


## ❖ Usage Examples
> [!IMPORTANT]
> BenchPOTS is available on both <a alt='PyPI' href='https://pypi.python.org/pypi/benchpots'><img align='center' src='https://img.shields.io/badge/PyPI--lightgreen?style=social&logo=pypi'></a>
> and <a alt='Anaconda' href='https://anaconda.org/conda-forge/benchpots'><img align='center' src='https://img.shields.io/badge/Anaconda--lightgreen?style=social&logo=anaconda'></a>❗️
>
> Install via pip:
> > pip install benchpots
>
> or install from source code:
> > pip install `https://github.com/WenjieDu/BenchPOTS/archive/main.zip`
>
> or install via conda:
> > conda install benchpots -c conda-forge
```python
import benchpots

# Load all three subsets of PhysioNet2012 and apply MCAR missingness with a 0.1 rate
benchpots.datasets.preprocess_physionet2012(subset="all", rate=0.1)

```
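
For the Solar Alabama pipeline added in this pull request, usage follows the same pattern; a minimal sketch (the 48-step window size is just an illustrative value):

```python
import benchpots

# slice the Solar Alabama series into 48-step windows and mask 10% of values (point missingness)
solar = benchpots.datasets.preprocess_solar_alabama(rate=0.1, n_steps=48, pattern="point")
```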

## ❖ Citing BenchPOTS/PyPOTS
The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811),
and a short version of it is accepted by the 9th SIGKDD International Workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/)).
9 changes: 8 additions & 1 deletion benchpots/__init__.py
@@ -22,4 +22,11 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.1.1"
__version__ = "0.2"

from . import utils, datasets

__all__ = [
    "utils",
    "datasets",
]
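
With the subpackages now re-exported at the top level, a quick sanity-check sketch of what this hunk implies (the outputs in the comments are expectations, not captured logs):

```python
import benchpots

print(benchpots.__version__)  # "0.2" after this commit
print(benchpots.__all__)      # ['utils', 'datasets']
benchpots.datasets            # accessible without an explicit `from benchpots import datasets`
```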
2 changes: 2 additions & 0 deletions benchpots/datasets/__init__.py
@@ -13,6 +13,7 @@
from .physionet_2012 import preprocess_physionet2012
from .physionet_2019 import preprocess_physionet2019
from .ucr_uea_datasets import preprocess_ucr_uea_datasets
from .solar_alabama import preprocess_solar_alabama

__all__ = [
"preprocess_physionet2012",
@@ -23,4 +24,5 @@
"preprocess_ett",
"preprocess_pems_traffic",
"preprocess_ucr_uea_datasets",
"preprocess_solar_alabama",
]
117 changes: 117 additions & 0 deletions benchpots/datasets/solar_alabama.py
@@ -0,0 +1,117 @@
"""
Preprocessing func for the dataset Solar Alabama.
"""

# Created by Wenjie Du <wenjay.du@gmail.com>
# License: BSD-3-Clause

import pandas as pd
import tsdb
from sklearn.preprocessing import StandardScaler

from ..utils.logging import logger, print_final_dataset_info
from ..utils.missingness import create_missingness
from ..utils.sliding import sliding_window


def preprocess_solar_alabama(
    rate,
    n_steps,
    pattern: str = "point",
    **kwargs,
) -> dict:
"""Load and preprocess the dataset Solar Alabama.
Parameters
----------
rate:
The missing rate.
n_steps:
The number of time steps to in the generated data samples.
Also the window size of the sliding window.
pattern:
The missing pattern to apply to the dataset.
Must be one of ['point', 'subseq', 'block'].
Returns
-------
processed_dataset :
A dictionary containing the processed Solar Alabama.
"""

    assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
    assert n_steps > 0, f"n_steps must be larger than 0, but got {n_steps}"

    # read the raw data
    data = tsdb.load("solar_alabama")
    df = data["X"]

    feature_names = df.columns.tolist()
    feature_names.remove("date")
    df["date"] = pd.to_datetime(df["date"])

    unique_months = df["date"].dt.to_period("M").unique()
    selected_as_train = unique_months[:6]  # use the first 6 months as the train set
    logger.info(f"months selected as train set are {selected_as_train}")
    selected_as_val = unique_months[6:9]  # use the following 3 months as the val set
    logger.info(f"months selected as val set are {selected_as_val}")
    selected_as_test = unique_months[9:]  # use the remaining 3 months as the test set
    logger.info(f"months selected as test set are {selected_as_test}")

    test_set = df[df["date"].dt.to_period("M").isin(selected_as_test)]
    val_set = df[df["date"].dt.to_period("M").isin(selected_as_val)]
    train_set = df[df["date"].dt.to_period("M").isin(selected_as_train)]

    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_set.loc[:, feature_names])
    val_X = scaler.transform(val_set.loc[:, feature_names])
    test_X = scaler.transform(test_set.loc[:, feature_names])

    train_X = sliding_window(train_X, n_steps)
    val_X = sliding_window(val_X, n_steps)
    test_X = sliding_window(test_X, n_steps)

    # assemble the final processed data into a dictionary
    processed_dataset = {
        # general info
        "n_steps": n_steps,
        "n_features": train_X.shape[-1],
        "scaler": scaler,
        # train set
        "train_X": train_X,
        # val set
        "val_X": val_X,
        # test set
        "test_X": test_X,
    }

    if rate > 0:
        # hold out ground truth in the original data for evaluation
        train_X_ori = train_X
        val_X_ori = val_X
        test_X_ori = test_X

        # mask values in the train set to keep it consistent with the validation and test sets below
        train_X = create_missingness(train_X, rate, pattern, **kwargs)
        # mask values in the validation set as ground truth
        val_X = create_missingness(val_X, rate, pattern, **kwargs)
        # mask values in the test set as ground truth
        test_X = create_missingness(test_X, rate, pattern, **kwargs)

        processed_dataset["train_X"] = train_X
        processed_dataset["train_X_ori"] = train_X_ori

        processed_dataset["val_X"] = val_X
        processed_dataset["val_X_ori"] = val_X_ori

        processed_dataset["test_X"] = test_X
        # test_X_ori is for error calculation, not for model input, hence it must not contain NaNs
        processed_dataset["test_X_ori"] = test_X_ori
    else:
        logger.warning("rate is 0, no missing values are artificially added.")

    print_final_dataset_info(train_X, val_X, test_X)
    return processed_dataset
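
Taken together, the new module windows the standardized data and, when `rate > 0`, keeps the original values alongside the masked ones for evaluation. A hedged usage sketch of the returned dictionary (array shapes are indicative only and depend on the chosen window size and the tsdb copy of the data):

```python
import benchpots

solar = benchpots.datasets.preprocess_solar_alabama(rate=0.1, n_steps=48, pattern="point")

print(solar["n_steps"], solar["n_features"])  # 48 and the number of feature columns (all columns except "date")
print(solar["train_X"].shape)                 # roughly (n_train_windows, 48, n_features), with artificial NaNs
print(solar["train_X_ori"].shape)             # same shape, without the artificial missingness
print(solar["scaler"])                        # the fitted StandardScaler, usable for inverse-transforming outputs
```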
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,5 +3,5 @@ numpy
pandas
scikit-learn
torch >=1.10
tsdb >=0.4
tsdb >=0.5
pygrinder >=0.6
2 changes: 1 addition & 1 deletion setup.cfg
@@ -28,5 +28,5 @@ basic =
    pandas
    scikit-learn
    torch >=1.10
    tsdb >=0.4
    tsdb >=0.5
    pygrinder >=0.6
2 changes: 1 addition & 1 deletion setup.py
@@ -50,7 +50,7 @@
"pandas",
"scikit-learn",
"torch >=1.10",
"tsdb >=0.4",
"tsdb >=0.5",
"pygrinder >=0.6",
],
python_requires=">=3.8.0",
