added test cases for split() and preprocess()

UBC-MDS · Dec 16, 2024 · 771b2d9 · 771b2d9
1 parent f790482
commit 771b2d9
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 1 deletion.
diff --git a/test/test_preprocessing.py b/test/test_preprocessing.py
@@ -0,0 +1,91 @@
+# test/test_preprocessing.py
+# References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
+
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+import pytest
+import pandas as pd
+import tempfile
+from src.preprocessing import preprocess
+
+# Tests for preprocess():
+# 1. Valid DataFrames: the expected output files are created
+# 2. Empty DataFrames: raises ValueError
+# 3. Non-existent directory: raises FileNotFoundError
+# 4. Non-DataFrame input: raises TypeError
+
+# Note: the tests for split() follow the same pattern.
+
+
+def test_preprocess_normal():
+    """
+    Tests the function with valid DataFrames and checks if the output files are created
+    """
+    # Create sample df
+    X_train = pd.DataFrame({
+        'num_feature': [1, 2, 3],
+        'cat_feature': ['A', 'B', 'A']
+    })
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    # Create a temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        preprocess(X_train, X_test, numeric_features, categorical_features, temp_dir)
+
+        # Check if the processed files are created
+        assert os.path.exists(os.path.join(temp_dir, 'delay_preprocessor.pickle'))
+        assert os.path.exists(os.path.join(temp_dir, 'train_processed.pickle'))
+        assert os.path.exists(os.path.join(temp_dir, 'test_processed.pickle'))
+
+def test_preprocess_empty_dataframe():
+    """
+    Tests the function with empty DataFrames to ensure it raises a ValueError
+    """
+    X_train = pd.DataFrame()
+    X_test = pd.DataFrame()
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(ValueError, match="DataFrame must contain observations."):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_path')
+
+def test_preprocess_invalid_path():
+    """
+    Tests the function with a non-existent directory to ensure it raises a FileNotFoundError
+    """
+    X_train = pd.DataFrame({
+        'num_feature': [1, 2, 3],
+        'cat_feature': ['A', 'B', 'A']
+    })
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(FileNotFoundError, match="Directory some_invalid_path does not exist."):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_invalid_path')
+
+def test_preprocess_invalid_input_type():
+    """
+    Tests the function with an invalid input type (string instead of df) 
+    to ensure it raises a TypeError
+    """
+    X_train = "not_a_dataframe"
+    X_test = pd.DataFrame({
+        'num_feature': [4, 5],
+        'cat_feature': ['B', 'A']
+    })
+    numeric_features = ['num_feature']
+    categorical_features = ['cat_feature']
+
+    with pytest.raises(TypeError, match="Input must be a pandas DataFrame"):
+        preprocess(X_train, X_test, numeric_features, categorical_features, 'some_path')
diff --git a/test/test_remove_outliers.py b/test/test_remove_outliers.py
@@ -1,4 +1,4 @@
-# src/tests/test_remove_outliers.py
+# test/test_remove_outliers.py
 # References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
 
 

diff --git a/test/test_split.py b/test/test_split.py
@@ -0,0 +1,78 @@
+# test/test_split.py
+# References: Tiffany's Breast Cancer Predictor and ChatGPT were used as references
+
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+import pytest
+import pandas as pd
+from src.split import split
+
+
+# Tests for split():
+# 1. Valid DataFrame with string and int columns: output files are created
+# 2. Empty DataFrame: raises ValueError
+# 3. Non-existent directory: raises FileNotFoundError
+# 4. Non-DataFrame input: raises TypeError
+
+# Note: the tests for preprocess() follow the same pattern.
+
+
+def test_split_normal_case(tmpdir):
+    """
+    Tests the function with a valid DataFrame to check that the output files
+    are created in a temporary directory.
+    """
+    # Create a sample df
+    data = pd.DataFrame({
+        'feature1': [1, 2, 3, 4, 5],
+        'feature2': ['A', 'B', 'C', 'D', 'E'],
+        'target': [0, 1, 0, 1, 0]
+    })
+
+    # Use a temporary directory for saving files
+    directory = tmpdir.mkdir("test_dir")
+
+    # Call the split function
+    split(data, 'target', str(directory))
+
+    # Check if the files are created
+    assert os.path.exists(os.path.join(directory, 'X_train.csv'))
+    assert os.path.exists(os.path.join(directory, 'y_train.csv'))
+    assert os.path.exists(os.path.join(directory, 'X_test.csv'))
+    assert os.path.exists(os.path.join(directory, 'y_test.csv'))
+
+def test_split_empty_dataframe():
+    """
+    Tests the function with an empty dataframe to ensure it raises a ValueError.
+    """
+    # Create an empty df
+    data = pd.DataFrame()
+
+    # Check if ValueError is raised
+    with pytest.raises(ValueError, match="DataFrame must contain observations."):
+        split(data, 'target', 'some_directory')
+
+def test_split_nonexistent_directory():
+    """
+    Tests the function with a valid DataFrame but a non-existent directory to 
+    ensure it raises a FileNotFoundError.
+    """
+    # Create a sample df
+    data = pd.DataFrame({
+        'feature1': [1, 2, 3],
+        'target': [0, 1, 0]
+    })
+
+    # Check if FileNotFoundError is raised
+    with pytest.raises(FileNotFoundError, match="Directory nonexistent_directory does not exist."):
+        split(data, 'target', 'nonexistent_directory')
+
+def test_split_invalid_data_type():
+    """
+    Tests the function with a string instead of a DataFrame to ensure it raises a TypeError.
+    """
+    # Check if TypeError is raised for non-DataFrame input
+    with pytest.raises(TypeError, match="Input must be a pandas DataFrame"):
+        split("not_a_dataframe", 'target', 'some_directory')