diff --git a/test/test_preprocessing.py b/test/test_preprocessing.py
new file mode 100644
index 0000000..8198294
--- /dev/null
+++ b/test/test_preprocessing.py
@@ -0,0 +1,91 @@
+# test/test_preprocessing.py
+# References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
+
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+import pytest
+import pandas as pd
+import tempfile
+from src.preprocessing import preprocess
+
+# Tests for preprocess:
+# 1. With regular/normal df
+# 2. With empty df
+# 3. With invalid path
+# 4. With invalid input types
+
+# note: split and preprocessing will have similar tests.
+
+
+def test_preprocess_normal():
+    """
+    Tests the function with valid DataFrames and checks that the output files are created
+    """
+    # Create sample df
+    X_train = pd.DataFrame({
+        'num_feature': [1, 2, 3],
+        'cat_feature': ['A', 'B', 'A']
+    })
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    # Create a temporary directory (removed automatically when the block exits)
+    with tempfile.TemporaryDirectory() as temp_dir:
+        preprocess(X_train, X_test, numeric_features, categorical_features, temp_dir)
+
+        # Check if the processed files are created
+        assert os.path.exists(os.path.join(temp_dir, 'delay_preprocessor.pickle'))
+        assert os.path.exists(os.path.join(temp_dir, 'train_processed.pickle'))
+        assert os.path.exists(os.path.join(temp_dir, 'test_processed.pickle'))
+
+def test_preprocess_empty_dataframe():
+    """
+    Tests the function with empty DataFrames to ensure it raises a ValueError
+    """
+    X_train = pd.DataFrame()
+    X_test = pd.DataFrame()
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(ValueError, match=r"DataFrame must contain observations\."):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_path')
+
+def test_preprocess_invalid_path():
+    """
+    Tests the function with a non-existent directory to ensure it raises a FileNotFoundError
+    """
+    X_train = pd.DataFrame({
+        'num_feature': [1, 2, 3],
+        'cat_feature': ['A', 'B', 'A']
+    })
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(FileNotFoundError, match=r"Directory some_invalid_path does not exist\."):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_invalid_path')
+
+def test_preprocess_invalid_input_type():
+    """
+    Tests the function with an invalid input type (string instead of df)
+    to ensure it raises a TypeError
+    """
+    X_train = "not_a_dataframe"
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(TypeError, match="Input must be a pandas DataFrame"):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_path')
diff --git a/test/test_remove_outliers.py b/test/test_remove_outliers.py
index 8b8a3d0..3ccc8f4 100644
--- a/test/test_remove_outliers.py
+++ b/test/test_remove_outliers.py
@@ -1,4 +1,4 @@
-# src/tests/test_remove_outliers.py
+# test/test_remove_outliers.py
 # References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
 
 
diff --git a/test/test_split.py b/test/test_split.py
new file mode 100644
index 0000000..3e23230
--- /dev/null
+++ b/test/test_split.py
@@ -0,0 +1,78 @@
+# test/test_split.py
+# References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
+
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+import pytest
+import pandas as pd
+from src.split import split
+
+
+# Tests for split() function:
+# 1. Can it split a regular dataframe with string and int values
+# 2. Test for empty dataset
+# 3. Raise error when the target directory does not exist
+# 4. Raise error for invalid data type
+
+# note: split and preprocessing will have similar tests.
+
+
+def test_split_normal_case(tmpdir):
+    """
+    Tests the function with a valid dataframe to check that the output files
+    are created in a temporary directory.
+    """
+    # Create a sample df
+    data = pd.DataFrame({
+        'feature1': [1, 2, 3, 4, 5],
+        'feature2': ['A', 'B', 'C', 'D', 'E'],
+        'target': [0, 1, 0, 1, 0]
+    })
+
+    # Use a temporary directory for saving files
+    directory = tmpdir.mkdir("test_dir")
+
+    # Call the split function
+    split(data, 'target', str(directory))
+
+    # Check if the files are created
+    assert os.path.exists(os.path.join(directory, 'X_train.csv'))
+    assert os.path.exists(os.path.join(directory, 'y_train.csv'))
+    assert os.path.exists(os.path.join(directory, 'X_test.csv'))
+    assert os.path.exists(os.path.join(directory, 'y_test.csv'))
+
+def test_split_empty_dataframe():
+    """
+    Tests the function with an empty dataframe to ensure it raises a ValueError.
+    """
+    # Create an empty df
+    data = pd.DataFrame()
+
+    # Check if ValueError is raised ('match' is a regex, so the final '.' is escaped)
+    with pytest.raises(ValueError, match=r"DataFrame must contain observations\."):
+        split(data, 'target', 'some_directory')
+
+def test_split_nonexistent_directory():
+    """
+    Tests the function with a valid DataFrame but a non-existent directory to
+    ensure it raises a FileNotFoundError.
+    """
+    # Create a sample df
+    data = pd.DataFrame({
+        'feature1': [1, 2, 3],
+        'target': [0, 1, 0]
+    })
+
+    # Check if FileNotFoundError is raised ('match' is a regex, so the final '.' is escaped)
+    with pytest.raises(FileNotFoundError, match=r"Directory nonexistent_directory does not exist\."):
+        split(data, 'target', 'nonexistent_directory')
+
+def test_split_invalid_data_type():
+    """
+    Tests the function with a string instead of a DataFrame to ensure it raises a TypeError.
+    """
+    # Check if TypeError is raised for non-DataFrame input
+    with pytest.raises(TypeError, match="Input must be a pandas DataFrame"):
+        split("not_a_dataframe", 'target', 'some_directory')