From 71f8fafe9f99187359646459214afc82474a1e73 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Thu, 30 Nov 2023 14:49:05 -0700 Subject: [PATCH 1/2] Fixing the hammah and phishing pipelines --- .../phishing-email-validation-data.jsonlines | 4 ++-- morpheus.code-workspace | 3 +++ morpheus/cli/commands.py | 6 ++++++ scripts/validation/val-run-pipeline.sh | 6 +++--- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/models/datasets/validation-data/phishing-email-validation-data.jsonlines b/models/datasets/validation-data/phishing-email-validation-data.jsonlines index d26906a546..714f550bfb 100644 --- a/models/datasets/validation-data/phishing-email-validation-data.jsonlines +++ b/models/datasets/validation-data/phishing-email-validation-data.jsonlines @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:573a2532228f71c26c5497f4cb70fe6bd883c9149d9ce03bf8eea0e7f36c4005 -size 118096 +oid sha256:3cdf4fcb39af89f34006de9de03cafb9f4f3d3dadc8812552480e01506d04f8a +size 125873 diff --git a/morpheus.code-workspace b/morpheus.code-workspace index fa49454fce..273057e0d5 100644 --- a/morpheus.code-workspace +++ b/morpheus.code-workspace @@ -129,13 +129,16 @@ "--model_max_batch_size=1024", "--use_cpp=False", "pipeline-ae", + "--columns_file=morpheus/data/columns_ae_cloudtrail.txt", "--userid_column_name=userIdentitysessionContextsessionIssueruserName", "--userid_filter=user123", + "--timestamp_column_name=event_dt", "from-cloudtrail", "--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv", "--max_files=200", "train-ae", "--train_data_glob=models/datasets/training-data/dfp-*.csv", + "--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", "--seed=42", "preprocess", "inf-pytorch", diff --git a/morpheus/cli/commands.py b/morpheus/cli/commands.py index 687025a933..99df8e1ff5 100644 --- a/morpheus/cli/commands.py +++ b/morpheus/cli/commands.py @@ -482,6 +482,11 @@ def pipeline_fil(ctx: click.Context, **kwargs): type=click.Choice(RANKDIR_CHOICES, case_sensitive=False), help=("Set the direction for the Graphviz pipeline diagram, " "ignored unless --viz_file is also specified.")) +@click.option('--timestamp_column_name', + type=str, + default="timestamp", + required=True, + help=("Which column to use as the timestamp.")) @prepare_command() def pipeline_ae(ctx: click.Context, **kwargs): """ @@ -511,6 +516,7 @@ def pipeline_ae(ctx: click.Context, **kwargs): config.ae = ConfigAutoEncoder() config.ae.userid_column_name = kwargs["userid_column_name"] + config.ae.timestamp_column_name = kwargs["timestamp_column_name"] config.ae.feature_scaler = kwargs["feature_scaler"] config.ae.use_generic_model = kwargs["use_generic_model"] config.ae.feature_columns = load_labels_file(kwargs["columns_file"]) diff --git a/scripts/validation/val-run-pipeline.sh b/scripts/validation/val-run-pipeline.sh index 3789a7fc87..97882e888a 100755 --- a/scripts/validation/val-run-pipeline.sh +++ b/scripts/validation/val-run-pipeline.sh @@ -107,7 +107,7 @@ function run_pipeline_phishing_email(){ preprocess --vocab_hash_file=${MORPHEUS_ROOT}/morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ ${INFERENCE_STAGE} \ monitor --description "Inference Rate" --smoothing=0.001 --unit inf \ - add-class --label=pred --threshold=0.7 \ + add-class --label=is_phishing --threshold=0.7 \ validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --overwrite \ serialize \ to-file --filename=${OUTPUT_FILE} --overwrite @@ -122,7 +122,7 @@ function run_pipeline_hammah_user123(){ VAL_OUTPUT=$5 morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ - pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \ + pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ preprocess \ @@ -144,7 +144,7 @@ function run_pipeline_hammah_role-g(){ VAL_OUTPUT=$5 morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ - pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \ + pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ preprocess \ From 84f44346bcd52015cb3a4dfba947d75464721b38 Mon Sep 17 00:00:00 2001 From: Michael Demoret Date: Thu, 30 Nov 2023 17:11:31 -0700 Subject: [PATCH 2/2] Fixing up the phishing labels for tests --- tests/benchmarks/test_bench_e2e_pipelines.py | 2 +- tests/test_phishing.py | 9 +++++---- tests/test_phishing_kafka.py | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/benchmarks/test_bench_e2e_pipelines.py b/tests/benchmarks/test_bench_e2e_pipelines.py index 76637614f8..746fa555b0 100644 --- a/tests/benchmarks/test_bench_e2e_pipelines.py +++ b/tests/benchmarks/test_bench_e2e_pipelines.py @@ -186,7 +186,7 @@ def test_phishing_nlp_e2e(benchmark, tmp_path): config.model_max_batch_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["model_max_batch_size"] config.feature_length = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["feature_length"] config.edge_buffer_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["edge_buffer_size"] - config.class_labels = ["score", "pred"] + config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) CppConfig.set_should_use_cpp(True) input_filepath = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["file_path"] diff --git a/tests/test_phishing.py b/tests/test_phishing.py index ea53dc56e6..16ccc371f6 100755 --- a/tests/test_phishing.py +++ b/tests/test_phishing.py @@ -34,6 +34,7 @@ from morpheus.stages.postprocess.validation_stage import ValidationStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage +from morpheus.utils.file_utils import load_labels_file # End-to-end test intended to imitate the Phishing validation test FEATURE_LENGTH = 128 @@ -71,7 +72,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path): mock_triton_client.async_infer.side_effect = async_infer config.mode = PipelineModes.NLP - config.class_labels = ["score", "pred"] + config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE config.pipeline_batch_size = 1024 config.feature_length = FEATURE_LENGTH @@ -96,7 +97,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path): TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', force_convert_inputs=True)) pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) - pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7)) + pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) pipe.add_stage(SerializeStage(config)) @@ -112,7 +113,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path): @pytest.mark.usefixtures("launch_mock_triton") def test_email_cpp(config, tmp_path): config.mode = PipelineModes.NLP - config.class_labels = ["score", "pred"] + config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE config.pipeline_batch_size = 1024 config.feature_length = FEATURE_LENGTH @@ -139,7 +140,7 @@ def test_email_cpp(config, tmp_path): server_url='localhost:8001', force_convert_inputs=True)) pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) - pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7)) + pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) pipe.add_stage(SerializeStage(config)) diff --git a/tests/test_phishing_kafka.py b/tests/test_phishing_kafka.py index 8de827aef9..f3bd41ec34 100755 --- a/tests/test_phishing_kafka.py +++ b/tests/test_phishing_kafka.py @@ -41,6 +41,7 @@ from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage from morpheus.utils.compare_df import compare_df +from morpheus.utils.file_utils import load_labels_file if (typing.TYPE_CHECKING): from kafka import KafkaConsumer @@ -87,7 +88,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock, mock_triton_client.async_infer.side_effect = async_infer config.mode = PipelineModes.NLP - config.class_labels = ["score", "pred"] + config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE config.pipeline_batch_size = 1024 config.feature_length = FEATURE_LENGTH @@ -120,7 +121,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock, TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', force_convert_inputs=True)) pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) - pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7)) + pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) @@ -154,7 +155,7 @@ def test_email_cpp(dataset_pandas: DatasetManager, kafka_topics: KafkaTopics, kafka_consumer: "KafkaConsumer"): config.mode = PipelineModes.NLP - config.class_labels = ["score", "pred"] + config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE config.pipeline_batch_size = 1024 config.feature_length = FEATURE_LENGTH @@ -187,7 +188,7 @@ def test_email_cpp(dataset_pandas: DatasetManager, server_url='localhost:8001', force_convert_inputs=True)) pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) - pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7)) + pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))