From 71f8fafe9f99187359646459214afc82474a1e73 Mon Sep 17 00:00:00 2001
From: Michael Demoret <mdemoret@nvidia.com>
Date: Thu, 30 Nov 2023 14:49:05 -0700
Subject: [PATCH 1/2] Fixing the hammah and phishing pipelines

---
 .../phishing-email-validation-data.jsonlines                | 4 ++--
 morpheus.code-workspace                                     | 3 +++
 morpheus/cli/commands.py                                    | 6 ++++++
 scripts/validation/val-run-pipeline.sh                      | 6 +++---
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/models/datasets/validation-data/phishing-email-validation-data.jsonlines b/models/datasets/validation-data/phishing-email-validation-data.jsonlines
index d26906a546..714f550bfb 100644
--- a/models/datasets/validation-data/phishing-email-validation-data.jsonlines
+++ b/models/datasets/validation-data/phishing-email-validation-data.jsonlines
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:573a2532228f71c26c5497f4cb70fe6bd883c9149d9ce03bf8eea0e7f36c4005
-size 118096
+oid sha256:3cdf4fcb39af89f34006de9de03cafb9f4f3d3dadc8812552480e01506d04f8a
+size 125873
diff --git a/morpheus.code-workspace b/morpheus.code-workspace
index fa49454fce..273057e0d5 100644
--- a/morpheus.code-workspace
+++ b/morpheus.code-workspace
@@ -129,13 +129,16 @@
                     "--model_max_batch_size=1024",
                     "--use_cpp=False",
                     "pipeline-ae",
+                    "--columns_file=morpheus/data/columns_ae_cloudtrail.txt",
                     "--userid_column_name=userIdentitysessionContextsessionIssueruserName",
                     "--userid_filter=user123",
+                    "--timestamp_column_name=event_dt",
                     "from-cloudtrail",
                     "--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv",
                     "--max_files=200",
                     "train-ae",
                     "--train_data_glob=models/datasets/training-data/dfp-*.csv",
+                    "--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage",
                     "--seed=42",
                     "preprocess",
                     "inf-pytorch",
diff --git a/morpheus/cli/commands.py b/morpheus/cli/commands.py
index 687025a933..99df8e1ff5 100644
--- a/morpheus/cli/commands.py
+++ b/morpheus/cli/commands.py
@@ -482,6 +482,11 @@ def pipeline_fil(ctx: click.Context, **kwargs):
               type=click.Choice(RANKDIR_CHOICES, case_sensitive=False),
               help=("Set the direction for the Graphviz pipeline diagram, "
                     "ignored unless --viz_file is also specified."))
+@click.option('--timestamp_column_name',
+              type=str,
+              default="timestamp",
+              required=True,
+              help=("Which column to use as the timestamp."))
 @prepare_command()
 def pipeline_ae(ctx: click.Context, **kwargs):
     """
@@ -511,6 +516,7 @@ def pipeline_ae(ctx: click.Context, **kwargs):
 
     config.ae = ConfigAutoEncoder()
     config.ae.userid_column_name = kwargs["userid_column_name"]
+    config.ae.timestamp_column_name = kwargs["timestamp_column_name"]
     config.ae.feature_scaler = kwargs["feature_scaler"]
     config.ae.use_generic_model = kwargs["use_generic_model"]
     config.ae.feature_columns = load_labels_file(kwargs["columns_file"])
diff --git a/scripts/validation/val-run-pipeline.sh b/scripts/validation/val-run-pipeline.sh
index 3789a7fc87..97882e888a 100755
--- a/scripts/validation/val-run-pipeline.sh
+++ b/scripts/validation/val-run-pipeline.sh
@@ -107,7 +107,7 @@ function run_pipeline_phishing_email(){
       preprocess --vocab_hash_file=${MORPHEUS_ROOT}/morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
       ${INFERENCE_STAGE} \
       monitor --description "Inference Rate" --smoothing=0.001 --unit inf \
-      add-class --label=pred --threshold=0.7 \
+      add-class --label=is_phishing --threshold=0.7 \
       validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --overwrite \
       serialize \
       to-file --filename=${OUTPUT_FILE} --overwrite
@@ -122,7 +122,7 @@ function run_pipeline_hammah_user123(){
    VAL_OUTPUT=$5
 
    morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
-      pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
+      pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
       from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
       train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \
       preprocess \
@@ -144,7 +144,7 @@ function run_pipeline_hammah_role-g(){
    VAL_OUTPUT=$5
 
    morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
-      pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
+      pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
       from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
       train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage  --seed 42 \
       preprocess \

From 84f44346bcd52015cb3a4dfba947d75464721b38 Mon Sep 17 00:00:00 2001
From: Michael Demoret <mdemoret@nvidia.com>
Date: Thu, 30 Nov 2023 17:11:31 -0700
Subject: [PATCH 2/2] Fixing up the phishing labels for tests

---
 tests/benchmarks/test_bench_e2e_pipelines.py | 2 +-
 tests/test_phishing.py                       | 9 +++++----
 tests/test_phishing_kafka.py                 | 9 +++++----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/tests/benchmarks/test_bench_e2e_pipelines.py b/tests/benchmarks/test_bench_e2e_pipelines.py
index 76637614f8..746fa555b0 100644
--- a/tests/benchmarks/test_bench_e2e_pipelines.py
+++ b/tests/benchmarks/test_bench_e2e_pipelines.py
@@ -186,7 +186,7 @@ def test_phishing_nlp_e2e(benchmark, tmp_path):
     config.model_max_batch_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["model_max_batch_size"]
     config.feature_length = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["feature_length"]
     config.edge_buffer_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["edge_buffer_size"]
-    config.class_labels = ["score", "pred"]
+    config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
     CppConfig.set_should_use_cpp(True)
 
     input_filepath = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["file_path"]
diff --git a/tests/test_phishing.py b/tests/test_phishing.py
index ea53dc56e6..16ccc371f6 100755
--- a/tests/test_phishing.py
+++ b/tests/test_phishing.py
@@ -34,6 +34,7 @@
 from morpheus.stages.postprocess.validation_stage import ValidationStage
 from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
 from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
+from morpheus.utils.file_utils import load_labels_file
 
 # End-to-end test intended to imitate the Phishing validation test
 FEATURE_LENGTH = 128
@@ -71,7 +72,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
     mock_triton_client.async_infer.side_effect = async_infer
 
     config.mode = PipelineModes.NLP
-    config.class_labels = ["score", "pred"]
+    config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
     config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
     config.pipeline_batch_size = 1024
     config.feature_length = FEATURE_LENGTH
@@ -96,7 +97,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
         TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
                              force_convert_inputs=True))
     pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
-    pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
+    pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
     pipe.add_stage(
         ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
     pipe.add_stage(SerializeStage(config))
@@ -112,7 +113,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
 @pytest.mark.usefixtures("launch_mock_triton")
 def test_email_cpp(config, tmp_path):
     config.mode = PipelineModes.NLP
-    config.class_labels = ["score", "pred"]
+    config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
     config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
     config.pipeline_batch_size = 1024
     config.feature_length = FEATURE_LENGTH
@@ -139,7 +140,7 @@ def test_email_cpp(config, tmp_path):
                              server_url='localhost:8001',
                              force_convert_inputs=True))
     pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
-    pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
+    pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
     pipe.add_stage(
         ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
     pipe.add_stage(SerializeStage(config))
diff --git a/tests/test_phishing_kafka.py b/tests/test_phishing_kafka.py
index 8de827aef9..f3bd41ec34 100755
--- a/tests/test_phishing_kafka.py
+++ b/tests/test_phishing_kafka.py
@@ -41,6 +41,7 @@
 from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
 from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
 from morpheus.utils.compare_df import compare_df
+from morpheus.utils.file_utils import load_labels_file
 
 if (typing.TYPE_CHECKING):
     from kafka import KafkaConsumer
@@ -87,7 +88,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
     mock_triton_client.async_infer.side_effect = async_infer
 
     config.mode = PipelineModes.NLP
-    config.class_labels = ["score", "pred"]
+    config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
     config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
     config.pipeline_batch_size = 1024
     config.feature_length = FEATURE_LENGTH
@@ -120,7 +121,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
         TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
                              force_convert_inputs=True))
     pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
-    pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
+    pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
     pipe.add_stage(SerializeStage(config))
     pipe.add_stage(
         WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))
@@ -154,7 +155,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
                    kafka_topics: KafkaTopics,
                    kafka_consumer: "KafkaConsumer"):
     config.mode = PipelineModes.NLP
-    config.class_labels = ["score", "pred"]
+    config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
     config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
     config.pipeline_batch_size = 1024
     config.feature_length = FEATURE_LENGTH
@@ -187,7 +188,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
                              server_url='localhost:8001',
                              force_convert_inputs=True))
     pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
-    pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
+    pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
     pipe.add_stage(SerializeStage(config))
     pipe.add_stage(
         WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))