Fixing the hammah and phishing validation pipelines #1398

Merged
Git LFS file not shown
3 changes: 3 additions & 0 deletions morpheus.code-workspace
@@ -129,13 +129,16 @@
"--model_max_batch_size=1024",
"--use_cpp=False",
"pipeline-ae",
"--columns_file=morpheus/data/columns_ae_cloudtrail.txt",
"--userid_column_name=userIdentitysessionContextsessionIssueruserName",
"--userid_filter=user123",
"--timestamp_column_name=event_dt",
"from-cloudtrail",
"--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv",
"--max_files=200",
"train-ae",
"--train_data_glob=models/datasets/training-data/dfp-*.csv",
"--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage",
"--seed=42",
"preprocess",
"inf-pytorch",
6 changes: 6 additions & 0 deletions morpheus/cli/commands.py
@@ -482,6 +482,11 @@ def pipeline_fil(ctx: click.Context, **kwargs):
type=click.Choice(RANKDIR_CHOICES, case_sensitive=False),
help=("Set the direction for the Graphviz pipeline diagram, "
"ignored unless --viz_file is also specified."))
@click.option('--timestamp_column_name',
type=str,
default="timestamp",
required=True,
help=("Which column to use as the timestamp."))
@prepare_command()
def pipeline_ae(ctx: click.Context, **kwargs):
"""
@@ -511,6 +516,7 @@ def pipeline_ae(ctx: click.Context, **kwargs):

config.ae = ConfigAutoEncoder()
config.ae.userid_column_name = kwargs["userid_column_name"]
config.ae.timestamp_column_name = kwargs["timestamp_column_name"]
config.ae.feature_scaler = kwargs["feature_scaler"]
config.ae.use_generic_model = kwargs["use_generic_model"]
config.ae.feature_columns = load_labels_file(kwargs["columns_file"])
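For context, here is a minimal sketch of the pattern this hunk adds: a `click` option whose parsed value is copied onto the autoencoder config. `FakeAEConfig` and the defaults below are illustrative stand-ins, not the actual Morpheus classes.

```python
# Minimal, hypothetical sketch (not the Morpheus implementation) of threading a
# --timestamp_column_name CLI option into an AE config object.
import click


class FakeAEConfig:
    """Illustrative stand-in for morpheus.config.ConfigAutoEncoder."""
    userid_column_name: str = "userIdentityaccountId"
    timestamp_column_name: str = "timestamp"


@click.command()
@click.option('--userid_column_name', type=str, default="userIdentityaccountId",
              help="Which column to use as the user id.")
@click.option('--timestamp_column_name', type=str, default="timestamp",
              help="Which column to use as the timestamp.")
def pipeline_ae(**kwargs):
    config = FakeAEConfig()
    # Copy the parsed CLI values onto the config, mirroring the diff above.
    config.userid_column_name = kwargs["userid_column_name"]
    config.timestamp_column_name = kwargs["timestamp_column_name"]
    click.echo(f"timestamp column: {config.timestamp_column_name}")


if __name__ == "__main__":
    pipeline_ae()
```

Incidentally, because the new option in the hunk sets both `default="timestamp"` and `required=True`, click will generally fall back to the default and never enforce the requirement, so the `required=True` there appears redundant.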
6 changes: 3 additions & 3 deletions scripts/validation/val-run-pipeline.sh
@@ -107,7 +107,7 @@ function run_pipeline_phishing_email(){
preprocess --vocab_hash_file=${MORPHEUS_ROOT}/morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
${INFERENCE_STAGE} \
monitor --description "Inference Rate" --smoothing=0.001 --unit inf \
add-class --label=pred --threshold=0.7 \
add-class --label=is_phishing --threshold=0.7 \
validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --overwrite \
serialize \
to-file --filename=${OUTPUT_FILE} --overwrite
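The `add-class` change above renames the output label from `pred` to `is_phishing`. Conceptually the stage thresholds a probability column and emits a boolean column named after the label; a rough pandas sketch of that behaviour (illustrative only, not the `AddClassificationsStage` implementation):

```python
# Rough sketch of the add-class behaviour used above: threshold the "is_phishing"
# probability column at 0.7 to produce a boolean classification column.
import pandas as pd

probs = pd.DataFrame({"is_phishing": [0.12, 0.85, 0.69, 0.91]})
classified = probs[["is_phishing"]] >= 0.7
print(classified)
#    is_phishing
# 0        False
# 1         True
# 2        False
# 3         True
```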
@@ -122,7 +122,7 @@ function run_pipeline_hammah_user123(){
VAL_OUTPUT=$5

morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \
preprocess \
@@ -144,7 +144,7 @@ function run_pipeline_hammah_role-g(){
VAL_OUTPUT=$5

morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \
preprocess \
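Both hammah runs now pass `--timestamp_column_name="event_dt"` so the AE stages know which CloudTrail column carries the event time. As a rough illustration (assumed column semantics, not the actual DFP source-stage code), per-user filtering and time ordering look like this:

```python
# Illustrative only: filter CloudTrail rows for one user and order them by the
# timestamp column, mirroring --userid_filter / --timestamp_column_name above.
import pandas as pd

df = pd.DataFrame({
    "userIdentitysessionContextsessionIssueruserName": ["user123", "role-g", "user123"],
    "event_dt": pd.to_datetime(["2021-01-01 00:03", "2021-01-01 00:02", "2021-01-01 00:01"]),
})

user_events = (df[df["userIdentitysessionContextsessionIssueruserName"] == "user123"]
               .sort_values("event_dt"))
print(user_events)
```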
2 changes: 1 addition & 1 deletion tests/benchmarks/test_bench_e2e_pipelines.py
@@ -186,7 +186,7 @@ def test_phishing_nlp_e2e(benchmark, tmp_path):
config.model_max_batch_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["model_max_batch_size"]
config.feature_length = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["feature_length"]
config.edge_buffer_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["edge_buffer_size"]
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
CppConfig.set_should_use_cpp(True)

input_filepath = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["file_path"]
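The benchmark now derives its class labels from `labels_phishing.txt` instead of a hard-coded list. A helper of that shape could look like the sketch below (illustrative; the real `morpheus.utils.file_utils.load_labels_file` may differ, and the file is assumed to hold one label per line, e.g. `score` and `is_phishing`):

```python
# Illustrative equivalent of a load_labels_file-style helper: one label per
# non-empty line of a text file.
from pathlib import Path


def load_labels(path: str) -> list[str]:
    return [line.strip() for line in Path(path).read_text().splitlines() if line.strip()]
```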
9 changes: 5 additions & 4 deletions tests/test_phishing.py
@@ -34,6 +34,7 @@
from morpheus.stages.postprocess.validation_stage import ValidationStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
from morpheus.utils.file_utils import load_labels_file

# End-to-end test intended to imitate the Phishing validation test
FEATURE_LENGTH = 128
@@ -71,7 +72,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
mock_triton_client.async_infer.side_effect = async_infer

config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -96,7 +97,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(
ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
pipe.add_stage(SerializeStage(config))
@@ -112,7 +113,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
@pytest.mark.usefixtures("launch_mock_triton")
def test_email_cpp(config, tmp_path):
config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -139,7 +140,7 @@ def test_email_cpp(config, tmp_path):
server_url='localhost:8001',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(
ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
pipe.add_stage(SerializeStage(config))
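Both tests keep `ValidationStage(..., rel_tol=0.05)`, i.e. the pipeline output only needs to match the validation file within a 5% relative tolerance. The underlying idea is the standard relative-tolerance comparison (illustrative; the stage compares whole dataframes, not scalars):

```python
# The kind of relative-tolerance check a rel_tol=0.05 validation implies.
import math

expected, actual = 0.934, 0.915
print(math.isclose(actual, expected, rel_tol=0.05))  # True: within 5% of expected
```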
9 changes: 5 additions & 4 deletions tests/test_phishing_kafka.py
@@ -41,6 +41,7 @@
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
from morpheus.utils.compare_df import compare_df
from morpheus.utils.file_utils import load_labels_file

if (typing.TYPE_CHECKING):
from kafka import KafkaConsumer
@@ -87,7 +88,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
mock_triton_client.async_infer.side_effect = async_infer

config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -120,7 +121,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(SerializeStage(config))
pipe.add_stage(
WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))
@@ -154,7 +155,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
kafka_topics: KafkaTopics,
kafka_consumer: "KafkaConsumer"):
config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -187,7 +188,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
server_url='localhost:8001',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(SerializeStage(config))
pipe.add_stage(
WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))
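The Kafka variants write the serialized output to a topic and then compare what the consumer reads back against the validation data via `compare_df`. In spirit, that final assertion is a dataframe equality check like the one below (illustrative pandas, not the Morpheus helper):

```python
# Illustrative stand-in for the compare_df assertion at the end of the Kafka tests:
# the rows read back from the output topic should match the expected results.
import pandas as pd

expected = pd.DataFrame({"is_phishing": [False, True, True]})
received = pd.DataFrame({"is_phishing": [False, True, True]})
pd.testing.assert_frame_equal(received, expected)  # raises if they differ
```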