Skip to content

Commit

Permalink
fix: bug in saving preprocess cache files (#310)
Browse files Browse the repository at this point in the history
* save the extra return values of preprocess_script() (everything after X_train, X_valid, y_train, y_valid, X_test) to 'others.pkl'

* fix CI
  • Loading branch information
XianBW authored Sep 24, 2024
1 parent dab2cff commit 5fb0608
Show file tree
Hide file tree
Showing 9 changed files with 22 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def preprocess_script():
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
ids = pd.read_pickle("ids.pkl")
others = pd.read_pickle("others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, ids
return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid = prepreprocess()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ def preprocess_script():
y_train = pd.read_pickle("/kaggle/preprocessed_data/y_train.pkl")
y_valid = pd.read_pickle("/kaggle/preprocessed_data/y_valid.pkl")
X_test = pd.read_pickle("/kaggle/preprocessed_data/X_test.pkl")
passenger_ids = pd.read_pickle("/kaggle/preprocessed_data/passenger_ids.pkl")
others = pd.read_pickle("/kaggle/preprocessed_data/others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
return X_train, X_valid, y_train, y_valid, X_test, *others
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ def preprocess_script():
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
passenger_ids = pd.read_pickle("passenger_ids.pkl")
others = pd.read_pickle("others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
return X_train, X_valid, y_train, y_valid, X_test, *others
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ def preprocess_script():
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
passenger_ids = pd.read_pickle("passenger_ids.pkl")
others = pd.read_pickle("others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
return X_train, X_valid, y_train, y_valid, X_test, *others
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
4 changes: 2 additions & 2 deletions rdagent/scenarios/kaggle/experiment/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def source_data(self) -> str:
y_train,
y_valid,
X_test,
passenger_ids,
*others,
) = preprocess_experiment.experiment_workspace.generate_preprocess_data()

data_folder.mkdir(exist_ok=True, parents=True)
Expand All @@ -122,7 +122,7 @@ def source_data(self) -> str:
pickle.dump(y_train, open(data_folder / "y_train.pkl", "wb"))
pickle.dump(y_valid, open(data_folder / "y_valid.pkl", "wb"))
pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
pickle.dump(passenger_ids, open(data_folder / "passenger_ids.pkl", "wb"))
pickle.dump(others, open(data_folder / "others.pkl", "wb"))

buffer = io.StringIO()
X_valid.info(verbose=True, buf=buffer, show_counts=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def preprocess_script():
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
return X_train, X_valid, y_train, y_valid, X_test
others = pd.read_pickle("others.pkl")
return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid, test, category_encoder, test_ids = prepreprocess()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ def preprocess_script():
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
passenger_ids = pd.read_pickle("passenger_ids.pkl")
others = pd.read_pickle("others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
return X_train, X_valid, y_train, y_valid, X_test, *others
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
13 changes: 7 additions & 6 deletions rdagent/scenarios/kaggle/experiment/workspace.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import subprocess
import zipfile
from pathlib import Path
from typing import Any

import pandas as pd

Expand All @@ -14,14 +15,14 @@
from fea_share_preprocess import preprocess_script
X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script()
X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script()
pickle.dump(X_train, open("X_train.pkl", "wb"))
pickle.dump(X_valid, open("X_valid.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_valid, open("y_valid.pkl", "wb"))
pickle.dump(X_test, open("X_test.pkl", "wb"))
pickle.dump(passenger_ids, open("passenger_ids.pkl", "wb"))
pickle.dump(others, open("others.pkl", "wb"))
"""


Expand All @@ -34,7 +35,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:

def generate_preprocess_data(
self,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.Series]:
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]:
kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

Expand All @@ -47,7 +48,7 @@ def generate_preprocess_data(
"y_train.pkl",
"y_valid.pkl",
"X_test.pkl",
"passenger_ids.pkl",
"others.pkl",
],
running_extra_volume=(
{KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
Expand All @@ -59,8 +60,8 @@ def generate_preprocess_data(
logger.error("Feature preprocess failed.")
raise Exception("Feature preprocess failed.")
else:
X_train, X_valid, y_train, y_valid, X_test, passenger_ids = results
return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
X_train, X_valid, y_train, y_valid, X_test, others = results
return X_train, X_valid, y_train, y_valid, X_test, *others

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
logger.info(f"Running the experiment in {self.workspace_path}")
Expand Down
1 change: 1 addition & 0 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
"covid19-global-forecasting-week-1",
"birdsong-recognition",
"optiver-trading-at-the-close",
"facebook-v-predicting-check-ins",
]

for i in dsagent_cs + other_cs:
Expand Down

0 comments on commit 5fb0608

Please sign in to comment.