KEP-2170: Add unit and E2E tests for model and dataset initializers

Signed-off-by: wei-chenglai <qazwsx0939059006@gmail.com>
kubeflow · Nov 9, 2024 · f4167e5 · f4167e5
1 parent 95be3c0
commit f4167e5
Show file tree

Hide file tree

Showing 17 changed files with 776 additions and 1 deletion.
diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml
@@ -97,6 +97,7 @@ jobs:
         run: |
           pip install pytest
           python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
+          pytest pkg/initializer_v2/test/e2e
         env:
           GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}
 

diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml
@@ -32,4 +32,6 @@ jobs:
           pip install -U './sdk/python[huggingface]'
 
       - name: Run unit test for training sdk
-        run: pytest ./sdk/python/kubeflow/training/api/training_client_test.py
+        run: |
+          pytest ./sdk/python/kubeflow/training/api/training_client_test.py
+          pytest ./pkg/initializer_v2/test/unit
diff --git a/pkg/initializer_v2/test/__init__.py b/pkg/initializer_v2/test/__init__.py
diff --git a/pkg/initializer_v2/test/conftest.py b/pkg/initializer_v2/test/conftest.py
@@ -0,0 +1,52 @@
+import os
+import sys
+
+import pytest
+
+# Add project root to path if needed
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+
+@pytest.fixture
+def mock_env_vars():
+    """Fixture to set and clean up environment variables"""
+    original_env = dict(os.environ)
+
+    def _set_env_vars(**kwargs):
+        for key, value in kwargs.items():
+            if value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = str(value)
+        return os.environ
+
+    yield _set_env_vars
+
+    # Cleanup
+    os.environ.clear()
+    os.environ.update(original_env)
+
+
+@pytest.fixture
+def huggingface_model_instance():
+    """Fixture for HuggingFace Model instance"""
+    from pkg.initializer_v2.model.huggingface import HuggingFace
+
+    return HuggingFace()
+
+
+@pytest.fixture
+def huggingface_dataset_instance():
+    """Fixture for HuggingFace Dataset instance"""
+    from pkg.initializer_v2.dataset.huggingface import HuggingFace
+
+    return HuggingFace()
+
+
+@pytest.fixture
+def real_hf_token():
+    """Fixture to provide real HuggingFace token for E2E tests"""
+    token = os.getenv("HUGGINGFACE_TOKEN")
+    # if not token:
+    #     pytest.skip("HUGGINGFACE_TOKEN environment variable not set")
+    return token
diff --git a/pkg/initializer_v2/test/e2e/__init__.py b/pkg/initializer_v2/test/e2e/__init__.py
diff --git a/pkg/initializer_v2/test/e2e/test_dataset.py b/pkg/initializer_v2/test/e2e/test_dataset.py
@@ -0,0 +1,107 @@
+import os
+import runpy
+import shutil
+import tempfile
+
+import pytest
+
+import pkg.initializer_v2.utils.utils as utils
+from sdk.python.kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET
+
+
+class TestDatasetE2E:
+    """E2E tests for dataset initialization"""
+
+    @pytest.fixture(autouse=True)
+    def setup_teardown(self, monkeypatch):
+        """Setup and teardown for each test"""
+        # Create temporary directory for dataset downloads
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        self.temp_dir = tempfile.mkdtemp(dir=current_dir)
+        os.environ[VOLUME_PATH_DATASET] = self.temp_dir
+
+        # Store original environment
+        self.original_env = dict(os.environ)
+
+        # Monkeypatch the constant in the module
+        import sdk.python.kubeflow.storage_initializer.constants as constants
+
+        monkeypatch.setattr(constants, "VOLUME_PATH_DATASET", self.temp_dir)
+
+        yield
+
+        # Cleanup
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        os.environ.clear()
+        os.environ.update(self.original_env)
+
+    def verify_dataset_files(self, expected_files):
+        """Verify downloaded dataset files"""
+        if expected_files:
+            actual_files = set(os.listdir(self.temp_dir))
+            missing_files = set(expected_files) - actual_files
+            assert not missing_files, f"Missing expected files: {missing_files}"
+
+    @pytest.mark.parametrize(
+        "test_name, provider, test_case",
+        [
+            # Public HuggingFace dataset test
+            (
+                "HuggingFace - Public dataset",
+                "huggingface",
+                {
+                    "storage_uri": "hf://karpathy/tiny_shakespeare",
+                    "access_token": None,
+                    "expected_files": ["tiny_shakespeare.py"],
+                    "expected_error": None,
+                },
+            ),
+            # Private HuggingFace dataset test
+            # (
+            #     "HuggingFace - Private dataset",
+            #     "huggingface",
+            #     {
+            #         "storage_uri": "hf://username/private-dataset",
+            #         "use_real_token": True,
+            #         "expected_files": ["config.json", "dataset.safetensors"],
+            #         "expected_error": None
+            #     }
+            # ),
+            # Invalid HuggingFace dataset test
+            (
+                "HuggingFace - Invalid dataset",
+                "huggingface",
+                {
+                    "storage_uri": "hf://invalid/nonexistent-dataset",
+                    "access_token": None,
+                    "expected_files": None,
+                    "expected_error": Exception,
+                },
+            ),
+        ],
+    )
+    def test_dataset_download(self, test_name, provider, test_case, real_hf_token):
+        """Test end-to-end dataset download for different providers"""
+        print(f"\nRunning E2E test for {provider}: {test_name}")
+
+        # Setup environment variables based on test case
+        os.environ[utils.STORAGE_URI_ENV] = test_case["storage_uri"]
+        expected_files = test_case.get("expected_files")
+
+        # Handle token/credentials
+        if test_case.get("use_real_token"):
+            os.environ["ACCESS_TOKEN"] = real_hf_token
+        elif test_case.get("access_token"):
+            os.environ["ACCESS_TOKEN"] = test_case["access_token"]
+
+        # Run the main script
+        if test_case["expected_error"]:
+            with pytest.raises(test_case["expected_error"]):
+                runpy.run_module(
+                    "pkg.initializer_v2.dataset.__main__", run_name="__main__"
+                )
+        else:
+            runpy.run_module("pkg.initializer_v2.dataset.__main__", run_name="__main__")
+            self.verify_dataset_files(expected_files)
+
+        print("Test execution completed")
diff --git a/pkg/initializer_v2/test/e2e/test_model.py b/pkg/initializer_v2/test/e2e/test_model.py
@@ -0,0 +1,113 @@
+import os
+import runpy
+import shutil
+import tempfile
+
+import pytest
+
+import pkg.initializer_v2.utils.utils as utils
+from sdk.python.kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL
+
+
+class TestModelE2E:
+    """E2E tests for model initialization"""
+
+    @pytest.fixture(autouse=True)
+    def setup_teardown(self, monkeypatch):
+        """Setup and teardown for each test"""
+        # Create temporary directory for model downloads
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        self.temp_dir = tempfile.mkdtemp(dir=current_dir)
+        print(self.temp_dir)
+        os.environ[VOLUME_PATH_MODEL] = self.temp_dir
+
+        # Store original environment
+        self.original_env = dict(os.environ)
+
+        # Monkeypatch the constant in the module
+        import sdk.python.kubeflow.storage_initializer.constants as constants
+
+        monkeypatch.setattr(constants, "VOLUME_PATH_MODEL", self.temp_dir)
+
+        yield
+
+        # Cleanup
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        os.environ.clear()
+        os.environ.update(self.original_env)
+
+    def verify_model_files(self, expected_files):
+        """Verify downloaded model files"""
+        if expected_files:
+            actual_files = set(os.listdir(self.temp_dir))
+            missing_files = set(expected_files) - actual_files
+            assert not missing_files, f"Missing expected files: {missing_files}"
+
+    @pytest.mark.parametrize(
+        "test_name, provider, test_case",
+        [
+            # Public HuggingFace model test
+            (
+                "HuggingFace - Public model",
+                "huggingface",
+                {
+                    "storage_uri": "hf://hf-internal-testing/tiny-random-bert",
+                    "access_token": None,
+                    "expected_files": [
+                        "config.json",
+                        "model.safetensors",
+                        "tokenizer.json",
+                        "tokenizer_config.json",
+                    ],
+                    "expected_error": None,
+                },
+            ),
+            # Private HuggingFace model test
+            # (
+            #     "HuggingFace - Private model",
+            #     "huggingface",
+            #     {
+            #         "storage_uri": "hf://username/private-model",
+            #         "use_real_token": True,
+            #         "expected_files": ["config.json", "model.safetensors"],
+            #         "expected_error": None
+            #     }
+            # ),
+            # Invalid HuggingFace model test
+            (
+                "HuggingFace - Invalid model",
+                "huggingface",
+                {
+                    "storage_uri": "hf://invalid/nonexistent-model",
+                    "access_token": None,
+                    "expected_files": None,
+                    "expected_error": Exception,
+                },
+            ),
+        ],
+    )
+    def test_model_download(self, test_name, provider, test_case, real_hf_token):
+        """Test end-to-end model download for different providers"""
+        print(f"\nRunning E2E test for {provider}: {test_name}")
+
+        # Setup environment variables based on test case
+        os.environ[utils.STORAGE_URI_ENV] = test_case["storage_uri"]
+        expected_files = test_case.get("expected_files")
+
+        # Handle token/credentials
+        if test_case.get("use_real_token"):
+            os.environ["ACCESS_TOKEN"] = real_hf_token
+        elif test_case.get("access_token"):
+            os.environ["ACCESS_TOKEN"] = test_case["access_token"]
+
+        # Run the main script
+        if test_case["expected_error"]:
+            with pytest.raises(test_case["expected_error"]):
+                runpy.run_module(
+                    "pkg.initializer_v2.model.__main__", run_name="__main__"
+                )
+        else:
+            runpy.run_module("pkg.initializer_v2.model.__main__", run_name="__main__")
+            self.verify_model_files(expected_files)
+
+        print("Test execution completed")
diff --git a/pkg/initializer_v2/test/unit/__init__.py b/pkg/initializer_v2/test/unit/__init__.py
diff --git a/pkg/initializer_v2/test/unit/dataset/__init__.py b/pkg/initializer_v2/test/unit/dataset/__init__.py
diff --git a/pkg/initializer_v2/test/unit/dataset/test_dataset.py b/pkg/initializer_v2/test/unit/dataset/test_dataset.py
@@ -0,0 +1,86 @@
+import runpy
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "test_name, test_case",
+    [
+        (
+            "Successful download with HuggingFace provider",
+            {
+                "storage_uri": "hf://dataset/path",
+                "access_token": "test_token",
+                "mock_config_error": False,
+                "expected_error": None,
+            },
+        ),
+        (
+            "Missing storage URI environment variable",
+            {
+                "storage_uri": None,
+                "access_token": None,
+                "mock_config_error": False,
+                "expected_error": Exception,
+            },
+        ),
+        (
+            "Invalid storage URI scheme",
+            {
+                "storage_uri": "invalid://dataset/path",
+                "access_token": None,
+                "mock_config_error": False,
+                "expected_error": Exception,
+            },
+        ),
+        (
+            "Config loading failure",
+            {
+                "storage_uri": "hf://dataset/path",
+                "access_token": None,
+                "mock_config_error": True,
+                "expected_error": Exception,
+            },
+        ),
+    ],
+)
+def test_dataset_main(test_name, test_case, mock_env_vars):
+    """Test main script with different scenarios"""
+    print(f"Running test: {test_name}")
+
+    # Setup mock environment variables
+    env_vars = {
+        "STORAGE_URI": test_case["storage_uri"],
+        "ACCESS_TOKEN": test_case["access_token"],
+    }
+    mock_env_vars(**env_vars)
+
+    # Setup mock HuggingFace instance
+    mock_hf_instance = MagicMock()
+    if test_case["mock_config_error"]:
+        mock_hf_instance.load_config.side_effect = Exception
+
+    with patch(
+        "pkg.initializer_v2.dataset.huggingface.HuggingFace",
+        return_value=mock_hf_instance,
+    ) as mock_hf:
+
+        # Execute test
+        if test_case["expected_error"]:
+            with pytest.raises(test_case["expected_error"]):
+                runpy.run_module(
+                    "pkg.initializer_v2.dataset.__main__", run_name="__main__"
+                )
+        else:
+            runpy.run_module("pkg.initializer_v2.dataset.__main__", run_name="__main__")
+
+            # Verify HuggingFace instance methods were called
+            mock_hf_instance.load_config.assert_called_once()
+            mock_hf_instance.download_dataset.assert_called_once()
+
+        # Verify HuggingFace class instantiation
+        if test_case["storage_uri"] and test_case["storage_uri"].startswith("hf://"):
+            mock_hf.assert_called_once()
+
+    print("Test execution completed")
diff --git a/pkg/initializer_v2/test/unit/dataset/test_dataset_config.py b/pkg/initializer_v2/test/unit/dataset/test_dataset_config.py
@@ -0,0 +1,16 @@
+from pkg.initializer_v2.dataset.config import HuggingFaceDatasetConfig
+
+
+def test_huggingface_dataset_config_creation():
+    """Test HuggingFaceModelInputConfig creation with different parameters"""
+    # Test with required parameters only
+    config = HuggingFaceDatasetConfig(storage_uri="hf://dataset/path")
+    assert config.storage_uri == "hf://dataset/path"
+    assert config.access_token is None
+
+    # Test with all parameters
+    config = HuggingFaceDatasetConfig(
+        storage_uri="hf://dataset/path", access_token="dummy_token"
+    )
+    assert config.storage_uri == "hf://dataset/path"
+    assert config.access_token == "dummy_token"