Tests for gnn_fraud_detection_pipeline & ransomware_detection (#904)

* Renamed `examples/gnn_fraud_detection_pipeline/requirements.yml` to `docker/conda/environments/cuda11.8_examples.yml`, replacing the original with a symlink.
* Test stage includes additional packages needed for gnn_fraud_detection_pipeline & ransomware_detection pipelines
* Locally pytest will skip these if the deps are missing
* Add a new `--fail_missing` flag which will cause tests to fail instead of skip on missing deps
* Remove unused redundant apt install of nodejs & npm
* Use `openjdk-11-jre-headless` for Kafka instead of `openjdk-11-jdk`, removing un-needed deps from the image like alsa and GL.

gnn_fraud_detection_pipeline changes:
* Perform pre-allocation of needed columns in `ClassificationStage`
* Replace deprecated usage of `StellarGraph` constructor in `FraudGraphConstructionStage` with `StellarGraph.from_networkx`
* Work-around Stellargraph/Python 3.10 incompatibility

fixes #907

ransomware_detection changes:
* Explicitly exclude `ldrmodules_df_path` from `model_features`
* Document that C++ execution is currently unsupported
* Move nested methods in `CreateFeaturesRWStage` to methods on the class allowing them to be tested
* Perform pre-allocation of needed columns in `PreprocessingRWStage`
* Update dependencies for training script, and update due to API changes in Tensorflow

Authors:
- David Gardner (https://github.com/dagardner-nv)

Approvers:
- Michael Demoret (https://github.com/mdemoret-nv)

URL: #904
nv-morpheus · May 15, 2023 · cef498a · cef498a
1 parent 79c5f2a
commit cef498a
Show file tree

Hide file tree

Showing 42 changed files with 1,299 additions and 296 deletions.
diff --git a/.github/workflows/ci_pipe.yml b/.github/workflows/ci_pipe.yml
@@ -133,6 +133,7 @@ jobs:
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
         PARALLEL_LEVEL: '10'
+        MERGE_EXAMPLES_YAML: '1'
     strategy:
       fail-fast: true
 
@@ -164,6 +165,8 @@ jobs:
         username: '$oauthtoken'
         password: ${{ secrets.NGC_API_KEY }}
       image: ${{ inputs.container }}
+      env:
+        MERGE_DOCS_YAML: '1'
     strategy:
       fail-fast: true
 

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
@@ -46,7 +46,7 @@ jobs:
     uses: ./.github/workflows/ci_pipe.yml
     with:
       run_check: ${{ startsWith(github.ref_name, 'pull-request/') }}
-      container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-build-230414
-      test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-230414
+      container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-build-230510
+      test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-230510
     secrets:
       NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
diff --git a/ci/runner/Dockerfile b/ci/runner/Dockerfile
@@ -45,6 +45,7 @@ RUN apt update && \
 COPY ./docker/conda/environments/* /tmp/conda/
 
 RUN CONDA_ALWAYS_YES=true /opt/conda/bin/mamba env create -n ${PROJ_NAME} -q --file /tmp/conda/cuda${CUDA_SHORT_VER}_dev.yml && \
+    /opt/conda/bin/mamba install -n morpheus -c conda-forge "conda-merge>=0.2" && \
     sed -i "s/conda activate base/conda activate ${PROJ_NAME}/g" ~/.bashrc && \
     conda clean -afy && \
     rm -rf /tmp/conda
@@ -68,25 +69,33 @@ RUN apt update && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 
-
 # ============ test ==================
 FROM base as test
 
 # Add any test-only dependencies here.
 
 ARG PROJ_NAME
+ARG CUDA_SHORT_VER
 
 RUN apt update && \
     DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \
     apt install --no-install-recommends -y \
-        nodejs \
-        npm \
-        openjdk-11-jdk && \
+        openjdk-11-jre-headless && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 
+COPY ./docker/conda/environments/cuda${CUDA_SHORT_VER}_examples.yml /tmp/conda/cuda${CUDA_SHORT_VER}_examples.yml
+
+# Install extra deps needed for gnn_fraud_detection_pipeline & ransomware_detection examples
+RUN CONDA_ALWAYS_YES=true /opt/conda/bin/mamba env update -n ${PROJ_NAME} -q --file /tmp/conda/cuda${CUDA_SHORT_VER}_examples.yml && \
+    conda clean -afy && \
+    source activate ${PROJ_NAME} && \
+    pip install --ignore-requires-python stellargraph==1.2.1 && \
+    rm -rf /tmp/conda
+
 # Install camouflage needed for unittests to mock a triton server
-RUN npm install -g camouflage-server@0.9 && \
+RUN source activate ${PROJ_NAME} && \
+    npm install -g camouflage-server@0.9 && \
     npm cache clean --force
 
 # Install pytest-kafka

diff --git a/ci/scripts/github/common.sh b/ci/scripts/github/common.sh
@@ -60,26 +60,50 @@ export SCCACHE_REGION="us-east-2"
 export SCCACHE_IDLE_TIMEOUT=32768
 #export SCCACHE_LOG=debug
 
+export CONDA_ENV_YML=${MORPHEUS_ROOT}/docker/conda/environments/cuda${CUDA_VER}_dev.yml
+export CONDA_EXAMPLES_YML=${MORPHEUS_ROOT}/docker/conda/environments/cuda${CUDA_VER}_examples.yml
+export CONDA_DOCS_YML=${MORPHEUS_ROOT}/docs/conda_docs.yml
+export PIP_REQUIREMENTS=${MORPHEUS_ROOT}/docker/conda/environments/requirements.txt
+
 export CMAKE_BUILD_ALL_FEATURES="-DCMAKE_MESSAGE_CONTEXT_SHOW=ON -DMORPHEUS_CUDA_ARCHITECTURES=60;70;75;80 -DMORPHEUS_BUILD_BENCHMARKS=ON -DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_BUILD_TESTS=ON -DMORPHEUS_USE_CONDA=ON -DMORPHEUS_PYTHON_INPLACE_BUILD=OFF -DMORPHEUS_PYTHON_BUILD_STUBS=ON -DMORPHEUS_USE_CCACHE=ON"
 
 export FETCH_STATUS=0
 
 print_env_vars
 
 function update_conda_env() {
-    rapids-logger "Checking for updates to conda env"
-
     # Deactivate the environment first before updating
     conda deactivate
 
-    # Update the packages with --prune to remove any extra packages
-    rapids-mamba-retry env update -n morpheus --prune -q --file ${MORPHEUS_ROOT}/docker/conda/environments/cuda${CUDA_VER}_dev.yml
+    ENV_YAML=${CONDA_ENV_YML}
+    if [[ "${MERGE_EXAMPLES_YAML}" == "1" || "${MERGE_DOCS_YAML}" == "1" ]]; then
+        # Merge the dev, docs and examples envs, otherwise --prune will remove the examples packages
+        ENV_YAML=${condatmpdir}/merged_env.yml
+        YAMLS="${CONDA_ENV_YML}"
+        if [[ "${MERGE_EXAMPLES_YAML}" == "1" ]]; then
+            YAMLS="${YAMLS} ${CONDA_EXAMPLES_YML}"
+        fi
+        if [[ "${MERGE_DOCS_YAML}" == "1" ]]; then
+            YAMLS="${YAMLS} ${CONDA_DOCS_YML}"
+        fi
+
+        # Conda is going to expect a requirements.txt file to be in the same directory as the env yaml
+        cp ${PIP_REQUIREMENTS} ${condatmpdir}/requirements.txt
+
+        rapids-logger "Merging conda envs: ${YAMLS}"
+        conda run -n morpheus --live-stream conda-merge ${YAMLS} > ${ENV_YAML}
+    fi
+
+    rapids-logger "Checking for updates to conda env"
+
+    # Update the packages
+    rapids-mamba-retry env update -n morpheus --prune -q --file ${ENV_YAML}
 
     # Finally, reactivate
     conda activate morpheus
 
     rapids-logger "Final Conda Environment"
-    conda list
+    show_conda_info
 }
 
 function fetch_base_branch() {
@@ -100,36 +124,6 @@ function fetch_base_branch() {
     rapids-logger "Base branch: ${BASE_BRANCH}"
 }
 
-function fetch_s3() {
-    ENDPOINT=$1
-    DESTINATION=$2
-    if [[ "${USE_S3_CURL}" == "1" ]]; then
-        curl -f "${DISPLAY_URL}${ENDPOINT}" -o "${DESTINATION}"
-        FETCH_STATUS=$?
-    else
-        aws s3 cp --no-progress "${S3_URL}${ENDPOINT}" "${DESTINATION}"
-        FETCH_STATUS=$?
-    fi
-}
-
-function restore_conda_env() {
-
-    rapids-logger "Downloading build artifacts from ${DISPLAY_ARTIFACT_URL}"
-    fetch_s3 "${ARTIFACT_ENDPOINT}/conda_env.tar.gz" "${WORKSPACE_TMP}/conda_env.tar.gz"
-    fetch_s3 "${ARTIFACT_ENDPOINT}/wheel.tar.bz" "${WORKSPACE_TMP}/wheel.tar.bz"
-
-    rapids-logger "Extracting"
-    mkdir -p /opt/conda/envs/morpheus
-
-    # We are using the --no-same-owner flag since user id & group id's are inconsistent between nodes in our CI pool
-    tar xf "${WORKSPACE_TMP}/conda_env.tar.gz" --no-same-owner --directory /opt/conda/envs/morpheus
-    tar xf "${WORKSPACE_TMP}/wheel.tar.bz" --no-same-owner --directory ${MORPHEUS_ROOT}
-
-    rapids-logger "Setting conda env"
-    conda activate morpheus
-    conda-unpack
-}
-
 function show_conda_info() {
 
     rapids-logger "Check Conda info"

diff --git a/ci/scripts/github/docs.sh b/ci/scripts/github/docs.sh
@@ -32,9 +32,6 @@ cd ${MORPHEUS_ROOT}
 git lfs install
 ${MORPHEUS_ROOT}/scripts/fetch_data.py fetch docs examples
 
-rapids-logger "Installing Documentation dependencies"
-mamba env update -f ${MORPHEUS_ROOT}/docs/conda_docs.yml
-
 git submodule update --init --recursive
 
 rapids-logger "Configuring for docs"

diff --git a/ci/scripts/github/test.sh b/ci/scripts/github/test.sh
@@ -69,7 +69,7 @@ done
 rapids-logger "Running Python tests"
 set +e
 
-python -I -m pytest --run_slow --run_kafka \
+python -I -m pytest --run_slow --run_kafka --fail_missing \
        --junit-xml=${REPORTS_DIR}/report_pytest.xml \
        --cov=morpheus \
        --cov-report term-missing \

diff --git a/docker/conda/environments/cuda11.8_examples.yml b/docker/conda/environments/cuda11.8_examples.yml
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Additional dependencies needed by some of the Morpheus examples.
+# The intended usage is to first create the conda environment from the `cuda11.8_dev.yml` file, and then update the
+# env with this file. ex:
+#   mamba env create -n morpheus --file docker/conda/environments/cuda11.8_dev.yml
+#   conda activate morpheus
+#   mamba env update -n morpheus --file docker/conda/environments/cuda11.8_examples.yml
+channels:
+    - rapidsai
+    - nvidia
+    - conda-forge
+dependencies:
+    - chardet=5.0.0
+    - cuml=23.02
+    - dask==2023.1.1
+    - distributed==2023.1.1
+    - pip
+    - pip:
+        # tensorflow exists in conda-forge but is tied to CUDA-11.3
+        - tensorflow==2.12.0
diff --git a/docs/conda_docs.yml b/docs/conda_docs.yml
@@ -21,5 +21,10 @@ dependencies:
     - pip
     ####### Morpheus Pip Dependencies (keep sorted!) #######
     - pip:
-        # Ensure all runtime requirements are installed using the requirements file
-        - --requirement requirements.txt
+        - breathe==4.34.0
+        - exhale==0.3.6
+        - ipython
+        - myst-parser==0.17.2
+        - nbsphinx
+        - sphinx
+        - sphinx_rtd_theme
diff --git a/docs/requirements.txt b/docs/requirements.txt
diff --git a/examples/gnn_fraud_detection_pipeline/README.md b/examples/gnn_fraud_detection_pipeline/README.md
@@ -22,8 +22,11 @@ Prior to running the GNN fraud detection pipeline, additional requirements must
 
 ```bash
 mamba env update -n ${CONDA_DEFAULT_ENV} -f examples/gnn_fraud_detection_pipeline/requirements.yml
+pip install --ignore-requires-python stellargraph==1.2.1
 ```
 
+> **Note**: The `--ignore-requires-python` flag is needed because Stellargraph only officially supports Python versions prior to 3.9 ([stellargraph/stellargraph#1960](https://github.com/stellargraph/stellargraph/issues/1960)).
+
 ## Running
 
 ##### Setup Env Variable

diff --git a/examples/gnn_fraud_detection_pipeline/requirements.yml b/examples/gnn_fraud_detection_pipeline/requirements.yml
@@ -20,7 +20,9 @@ channels:
 dependencies:
     - chardet=5.0.0
     - cuml=23.02
+    - dask==2023.1.1
+    - distributed==2023.1.1
+    - pip
     - pip:
         # tensorflow exists in conda-forge but is tied to CUDA-11.3
-        - stellargraph==1.2.1
         - tensorflow==2.12.0
diff --git a/examples/gnn_fraud_detection_pipeline/stages/classification_stage.py b/examples/gnn_fraud_detection_pipeline/stages/classification_stage.py
@@ -21,6 +21,7 @@
 import cuml
 
 from morpheus.cli.register_stage import register_stage
+from morpheus.common import TypeId
 from morpheus.config import Config
 from morpheus.config import PipelineModes
 from morpheus.messages import MultiMessage
@@ -48,6 +49,7 @@ def __init__(self, c: Config, model_xgb_file: str):
         super().__init__(c)
 
         self._xgb_model = cuml.ForestInference.load(model_xgb_file, output_class=True)
+        self._needed_columns.update({'node_id': TypeId.INT64, 'prediction': TypeId.FLOAT32})
 
     @property
     def name(self) -> str:
@@ -61,9 +63,11 @@ def supports_cpp_node(self):
 
     def _process_message(self, message: GraphSAGEMultiMessage):
         ind_emb_columns = message.get_meta(message.inductive_embedding_column_names)
-
         message.set_meta("node_id", message.node_identifiers)
 
+        # The XGBoost model returns two probabilities for the binary classification. The first (column 0) is the
+        # probability that the transaction is in the benign class, and the second (column 1) is the probability that
+        # the transaction is in the fraudulent class. Added together, the two values will always equal 1.
         prediction = self._xgb_model.predict_proba(ind_emb_columns).iloc[:, 1]
 
         message.set_meta("prediction", prediction)

diff --git a/examples/gnn_fraud_detection_pipeline/stages/graph_construction_stage.py b/examples/gnn_fraud_detection_pipeline/stages/graph_construction_stage.py
@@ -90,7 +90,7 @@ def _graph_construction(nodes, edges, node_features) -> "stellargraph.StellarGra
         for edge in edges:
             g_nx.add_edges_from(edge)
 
-        return StellarGraph(g_nx, node_type_name="ntype", node_features=node_features)
+        return StellarGraph.from_networkx(g_nx, node_type_attr='ntype', node_features=node_features)
 
     @staticmethod
     def _build_graph_features(dataset: pd.DataFrame) -> "stellargraph.StellarGraph":

diff --git a/examples/ransomware_detection/config/ransomware_detection.yaml b/examples/ransomware_detection/config/ransomware_detection.yaml
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
---- 
-file_extensions: 
+---
+file_extensions:
   - doc
   - docx
   - html
@@ -38,7 +38,7 @@ file_extensions:
   - 7z
   - rar
   - msg
-model_features: 
+model_features:
   - envirs_pathext
   - count_double_extension_count_handles
   - page_readonly_vads_count
@@ -138,8 +138,9 @@ model_features:
   - page_execute_readwrite_vads_count
   - handles_df_type_unique_ratio
   - page_execute_readwrite_count
+features:
   - ldrmodules_df_path
-raw_columns: 
+raw_columns:
   - Base
   - Block
   - CommitCharge

diff --git a/examples/ransomware_detection/run.py b/examples/ransomware_detection/run.py
@@ -36,7 +36,7 @@
 
 @click.command()
 @click.option('--debug', default=False)
-@click.option('--use_cpp', default=False)
+@click.option('--use_cpp', default=False, help="Enable C++ execution for this pipeline, currently this is unsupported.")
 @click.option(
     "--num_threads",
     default=os.cpu_count(),
@@ -147,7 +147,10 @@ def run_pipeline(debug,
     cols_interested_plugins = rwd_conf['raw_columns']
 
     # Feature columns used by the model.
-    feature_columns = rwd_conf['model_features']
+    model_features = rwd_conf['model_features']
+
+    # Features to include in the DF: a superset of model_features that adds a few features the model doesn't receive
+    feature_columns = model_features + rwd_conf['features']
 
     # File extensions.
     file_extns = rwd_conf['file_extensions']
@@ -185,8 +188,7 @@ def run_pipeline(debug,
 
     # Add preprocessing stage.
     # This stage generates snapshot sequences using sliding window for each pid_process.
-    pipeline.add_stage(PreprocessingRWStage(config, feature_columns=feature_columns[:-1],
-                                            sliding_window=sliding_window))
+    pipeline.add_stage(PreprocessingRWStage(config, feature_columns=model_features, sliding_window=sliding_window))
 
     # Add a monitor stage
     # This stage logs the metrics (msg/sec) from the above stage.