
Commit

Merge dev into main
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com>
nvauto committed Sep 10, 2024
2 parents 7b01710 + 71601e1 commit 9613aa1
Showing 23 changed files with 1,325 additions and 6 deletions.
3 changes: 2 additions & 1 deletion scripts/header-check.sh
@@ -26,9 +26,10 @@ EXCLUDE_PATTERNS=(
"core/src/main/resources/*"
"core/src/test/resources/*"
"user_tools/src/spark_rapids_pytools/resources/*"
"user_tools/docs/resources/*"
"user_tools/docs/*"
"user_tools/tests/spark_rapids_tools_ut/resources/*"
"*.csv"
"*.zstd"
)

# Create the grep exclude options (--exclude=*csv --exclude=core/src/test/resources/*)
40 changes: 40 additions & 0 deletions user_tools/docs/qualx.md
@@ -135,6 +135,7 @@ spark_rapids train \

Once satisfied with the model, just supply the path to this model in the `--custom_model_file` argument for prediction.

### Training (Advanced)
#### Fine-tuning / Incremental Training

To continue training an existing pre-trained model on new data, just set up the new dataset per above and then
@@ -178,3 +179,42 @@ df.to_csv('features/features_with_label.csv', index=False)
Then, train a custom model with the `--features_csv_dir features` argument.

Once satisfied with the model, just supply the path to this model in the `--custom_model_file` argument for prediction.

#### Dataset-specific Plugins

In certain situations, a dataset may require custom handling. For these cases, we provide a plugin mechanism
for attaching custom code to that dataset. The plugin implementation is a Python file that defines
any of the following functions:
```python
import pandas as pd

def load_profiles_hook(profile_df: pd.DataFrame) -> pd.DataFrame:
"""Custom post processing on the load_profiles dataframe."""
# Insert custom code to modify the profile_df as needed.
# Note: profile_df contains "raw" features extracted from the Profiler tool's output CSV files.
return profile_df


def split_function(cpu_aug_tbl: pd.DataFrame) -> pd.DataFrame:
"""Custom train/test/val split function."""
# Insert custom code to set cpu_aug_tbl['split'] to 'train', 'test', or 'val'.
# Note: the default split function randomly splits the data by ratios of 60/20/20.
return cpu_aug_tbl
```
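
For illustration, below is a minimal sketch of a custom `split_function` that assigns each application to a split deterministically by hashing an identifier instead of splitting randomly. The `appName` column is an assumption for this example; substitute whichever identifier column your features actually contain.

```python
import hashlib

import pandas as pd


def split_function(cpu_aug_tbl: pd.DataFrame) -> pd.DataFrame:
    """Assign rows to train/test/val deterministically by hashing an app identifier."""
    def assign(name) -> str:
        # Stable hash so the same app always lands in the same split across runs.
        bucket = int(hashlib.md5(str(name).encode()).hexdigest(), 16) % 10
        if bucket < 6:
            return 'train'  # ~60%
        if bucket < 8:
            return 'test'   # ~20%
        return 'val'        # ~20%

    # 'appName' is a hypothetical column name; use the identifier present in your dataset.
    cpu_aug_tbl['split'] = cpu_aug_tbl['appName'].map(assign)
    return cpu_aug_tbl
```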

To use a custom plugin, reference it in the associated dataset JSON file:
```
# datasets/onprem/my_custom_dataset.json
{
"my_custom_dataset": {
"eventlogs": [
"/path/to/eventlogs"
],
"app_meta": {
...
},
"load_profiles_hook": "/path/to/custom_plugin.py",
"split_function": "/path/to/custom_plugin.py"
}
}
```
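
Both keys may point to the same plugin file, as in this example; the plugin only needs to define the hook functions it actually uses.
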
172 changes: 172 additions & 0 deletions user_tools/docs/tools_e2e_tests.md
@@ -0,0 +1,172 @@
# Spark Rapids Tools End-to-End Behavior Tests

This document outlines the end-to-end tests for Spark Rapids tools, designed to cover scenarios such as missing
dependencies, handling different types of event logs, and interacting with HDFS.

## Directory Structure
```commandline
user_tools/tests/spark_rapids_tools_e2e/
├── features # Contains test scenarios and environment setup.
│ ├── environment.py # Setup and teardown procedures for the tests.
│ ├── steps # Step definitions for the tests.
│ └── *.feature # Feature files defining test scenarios.
└── resources # Resources used in the tests.
├── event_logs
└── scripts # Scripts used in the tests.
```
Configuration for the `behave` tests is defined in the `user_tools/tox.ini` file.

## Setup

From the `<repo_root>/user_tools` directory, run one of the following commands to install the required dependencies:


```sh
pip install behave
# or
pip install .[test]
```


## Running Tests
Tests can be run using the `behave` command or the `tox` command.

**Basic Usage:**

```sh
behave <options>
# or
tox -e behave -- <options>
```

**Run All Tests:**

```sh
behave
# or
tox -e behave
```

### Common Options

**Run Specific Tests by Tag**

See the [Tags Format](#tags-format) section for more information on tags.

```sh
behave --tags <tag>
# or
tox -e behave -- --tags <tag>
```

**Run Specific Tests by Name**

```sh
behave --name <scenario_name>
# or
tox -e behave -- --name <scenario_name>
```

**Skip Tests by Tag**

```sh
behave --tags ~<tag>
# or
tox -e behave -- --tags ~<tag>
```

**Custom Arguments**
- Custom arguments can be passed to the behave tests using the `-D` flag.
- Example: Skip building the Tools jar during setup.

```sh
behave -D build_jar=false # Skip building the Tools jar during setup (default: true)
# or
tox -e behave -- -D build_jar=false
```

**Verbose Mode**
- When verbose mode is enabled, `STDOUT` and `STDERR` from all subprocesses executed during the test run are shown in the console.
```sh
behave -v
# or
tox -e behave -- -v
```

## Notes

### Tags Format
Tags are used to uniquely identify test scenarios and are defined in the following format: `@test_id_<feature>_<00xx>`.
- `<feature>`: Acronym for the feature file being tested. Examples:
- `ELP` for `event_log_processing.feature`
- `IC` for `installation_checks.feature`
- `<00xx>`: Unique 4-digit identifier for the test scenario. Examples: `0001`, `0002`.

Tags Example: `@test_id_ELP_0001`, `@test_id_IC_0002`.
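
Running `behave --tags @test_id_ELP_0001` executes only that scenario, while `behave --tags ~@test_id_ELP_0001` skips it.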

### Built-in Setup Steps

The tests include the following setup steps:

1. Build the Spark Rapids Tools JAR:
   - By default, the JAR is built before running the tests.
   - To skip this step (e.g., if the JAR is already built), use the argument `-D build_jar=false`.
2. Build the Python package.

The test warns the user that initial setup may take a few minutes.

### Built-in HDFS Cluster Setup

- Some of the tests include configuring a local HDFS cluster. Step: `HDFS is "{status}"`
- This step downloads Hadoop binaries and sets up the cluster.
- The download occurs only once per machine but cluster setup is done for each test run.
- Download step may take a few minutes.
- Tests involving HDFS are tagged with `@long_running` and can be skipped using `--tags ~@long_running`.

#### HDFS Configuration
- Replication factor: 1
- Disk Space Quota: 2GB
- Temp Directory: `/tmp/spark_rapids_tools_e2e_tests`
- Temp Directory can be changed using the argument `-D e2e_tests_tmp_dir=<dir_path>` during test run.

#### Cleanup
- Step `HDFS is "{status}"` sets an after-scenario hook to stop the HDFS cluster and remove the temporary directories.
- It does not clean up the Hadoop binaries downloaded during the setup.
- Cleanup can be done manually using the script below:
```sh
<repo_root>/user_tools/tests/spark_rapids_tools_e2e/resources/scripts/hdfs/cleanup_hdfs.sh
```

## Debugging Tests in an IDE

- Ensure the Python interpreter is set to the correct virtual environment and `JAVA_HOME` is set.

**IntelliJ**
- Add a Python run configuration with module name: `behave` and working directory: `<repo-root>/user_tools`.
- Add required arguments in `Script parameters` field.

Sample Run Configuration:
![resources/debug-behave-intellij.png](resources/debug-behave-intellij.png)

**VS Code**
- Open or create the `.vscode/launch.json` file. Add the following configuration with required arguments:
```json
{
"configurations": [
{
"name": "Python: Spark Rapids Tools E2E Tests",
"type": "debugpy",
"request": "launch",
"module": "behave",
"args": [],
"python": "${command:python.interpreterPath}",
"cwd": "${workspaceFolder}/user_tools"
}
]
}
```
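
Behave options documented above (for example `--tags <tag>` or `-D build_jar=false`) can be supplied through the `args` array of this configuration.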


## Guidelines for Writing Tests

TODO: Add guidelines and conventions for writing tests.
2 changes: 1 addition & 1 deletion user_tools/pyproject.toml
@@ -76,7 +76,7 @@ version = {attr = "spark_rapids_pytools.__version__"}
repository = "https://github.com/NVIDIA/spark-rapids-tools/tree/main"
[project.optional-dependencies]
test = [
"tox", 'pytest', 'cli_test_helpers'
"tox", 'pytest', 'cli_test_helpers', 'behave'
]
qualx = [
"holoviews",
119 changes: 119 additions & 0 deletions user_tools/tests/spark_rapids_tools_e2e/features/environment.py
@@ -0,0 +1,119 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module defines environment setup and teardown functions for the end-to-end tests using behave.
"""

import os
import shutil
import tempfile

from spark_rapids_tools.utils import Utilities
from steps.e2e_utils import E2ETestUtils

""" Define behave hooks for the tests. These hooks are automatically called by behave. """

logger = E2ETestUtils.get_logger()


def before_all(context) -> None:
"""
Set up the environment for the tests. This function is automatically called before all the tests.
"""
context.temp_dir = tempfile.mkdtemp()
_set_environment_variables(context)
_set_verbose_mode(context)
_setup_env(context)


def after_all(context) -> None:
"""
Clean up the environment after the tests. This function is automatically called after all the tests.
"""
_clear_environment_variables()
shutil.rmtree(context.temp_dir)


def before_scenario(context, scenario) -> None:
if "skip" in scenario.effective_tags:
scenario.skip("Marked with @skip")
return


def after_scenario(context, scenario) -> None:
"""
Clean up the environment after each scenario. This function is automatically called after each scenario.
Steps must set the callback function using set_after_scenario_fn() to perform any cleanup.
"""
if hasattr(context, 'after_scenario_fn'):
context.after_scenario_fn()


def _set_verbose_mode(context) -> None:
verbose_enabled = getattr(context.config, 'verbose', False)
if verbose_enabled:
context.config.stdout_capture = False
context.config.stderr_capture = False
os.environ['E2E_TEST_VERBOSE_MODE'] = str(verbose_enabled).lower()


def _set_environment_variables(context) -> None:
"""
Set environment variables needed for the virtual environment setup.
"""
tools_version = Utilities.get_base_release()
scala_version = context.config.userdata.get('scala_version')
venv_name = context.config.userdata.get('venv_name')
jar_filename = f'rapids-4-spark-tools_{scala_version}-{tools_version}-SNAPSHOT.jar'
build_jar_value = context.config.userdata.get('build_jar')
build_jar = build_jar_value.lower() in ['true', '1', 'yes']

os.environ['E2E_TEST_TOOLS_DIR'] = E2ETestUtils.get_tools_root_path()
os.environ['E2E_TEST_SCRIPTS_DIR'] = os.path.join(E2ETestUtils.get_e2e_tests_resource_path(), 'scripts')
os.environ['E2E_TEST_TOOLS_JAR_PATH'] = os.path.join(os.environ['E2E_TEST_TOOLS_DIR'],
f'core/target/{jar_filename}')
os.environ['E2E_TEST_VENV_DIR'] = os.path.join(context.temp_dir, venv_name)
os.environ['E2E_TEST_BUILD_JAR'] = 'true' if build_jar else 'false'
os.environ['E2E_TEST_SPARK_BUILD_VERSION'] = context.config.userdata.get('buildver')
os.environ['E2E_TEST_HADOOP_VERSION'] = context.config.userdata.get('hadoop.version')
os.environ['E2E_TEST_TMP_DIR'] = context.config.userdata.get('e2e_tests_tmp_dir')


def _setup_env(context) -> None:
"""
Build the JAR and set up the virtual environment for the tests.
"""
script_file_name = context.config.userdata.get('setup_script_file')
script = os.path.join(os.environ['E2E_TEST_SCRIPTS_DIR'], script_file_name)
try:
warning_msg = "Setting up the virtual environment for the tests. This may take a while."
        if os.environ.get('E2E_TEST_BUILD_JAR') == 'true':
warning_msg = f'Building JAR and {warning_msg}'
logger.warning(warning_msg)
result = E2ETestUtils.run_sys_cmd([script])
E2ETestUtils.assert_sys_cmd_return_code(result,
exp_return_code=0,
error_msg="Failed to create virtual environment")
except Exception as e: # pylint: disable=broad-except
raise RuntimeError(f"Failed to create virtual environment. Reason: {str(e)}") from e


def _clear_environment_variables() -> None:
"""
Clear environment variables set for the virtual environment setup.
"""
    env_vars = ['E2E_TEST_SCRIPTS_DIR', 'E2E_TEST_VENV_DIR', 'E2E_TEST_TOOLS_JAR_PATH']
for key in env_vars:
os.environ.pop(key, None)