Fixing the hammah and phishing validation pipelines #1398

Merged
Git LFS file not shown
3 changes: 3 additions & 0 deletions morpheus.code-workspace
@@ -129,13 +129,16 @@
"--model_max_batch_size=1024",
"--use_cpp=False",
"pipeline-ae",
"--columns_file=morpheus/data/columns_ae_cloudtrail.txt",
"--userid_column_name=userIdentitysessionContextsessionIssueruserName",
"--userid_filter=user123",
"--timestamp_column_name=event_dt",
"from-cloudtrail",
"--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv",
"--max_files=200",
"train-ae",
"--train_data_glob=models/datasets/training-data/dfp-*.csv",
"--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage",
"--seed=42",
"preprocess",
"inf-pytorch",
6 changes: 6 additions & 0 deletions morpheus/cli/commands.py
@@ -482,6 +482,11 @@ def pipeline_fil(ctx: click.Context, **kwargs):
type=click.Choice(RANKDIR_CHOICES, case_sensitive=False),
help=("Set the direction for the Graphviz pipeline diagram, "
"ignored unless --viz_file is also specified."))
@click.option('--timestamp_column_name',
type=str,
default="timestamp",
required=True,
help=("Which column to use as the timestamp."))
@prepare_command()
def pipeline_ae(ctx: click.Context, **kwargs):
"""
@@ -511,6 +516,7 @@ def pipeline_ae(ctx: click.Context, **kwargs):

config.ae = ConfigAutoEncoder()
config.ae.userid_column_name = kwargs["userid_column_name"]
config.ae.timestamp_column_name = kwargs["timestamp_column_name"]
config.ae.feature_scaler = kwargs["feature_scaler"]
config.ae.use_generic_model = kwargs["use_generic_model"]
config.ae.feature_columns = load_labels_file(kwargs["columns_file"])
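For context, here is a minimal sketch of the pattern this hunk adds: a `click` option whose parsed value is copied onto the autoencoder config. `FakeAEConfig` and the defaults below are illustrative stand-ins, not the actual Morpheus classes.

```python
# Minimal, hypothetical sketch (not the Morpheus implementation) of threading a
# --timestamp_column_name CLI option into an AE config object.
import click


class FakeAEConfig:
    """Illustrative stand-in for morpheus.config.ConfigAutoEncoder."""
    userid_column_name: str = "userIdentityaccountId"
    timestamp_column_name: str = "timestamp"


@click.command()
@click.option('--userid_column_name', type=str, default="userIdentityaccountId",
              help="Which column to use as the user id.")
@click.option('--timestamp_column_name', type=str, default="timestamp",
              help="Which column to use as the timestamp.")
def pipeline_ae(**kwargs):
    config = FakeAEConfig()
    # Copy the parsed CLI values onto the config, mirroring the diff above.
    config.userid_column_name = kwargs["userid_column_name"]
    config.timestamp_column_name = kwargs["timestamp_column_name"]
    click.echo(f"timestamp column: {config.timestamp_column_name}")


if __name__ == "__main__":
    pipeline_ae()
```

Incidentally, because the new option in the hunk sets both `default="timestamp"` and `required=True`, click will generally fall back to the default and never enforce the requirement, so the `required=True` there appears redundant.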
6 changes: 3 additions & 3 deletions scripts/validation/val-run-pipeline.sh
@@ -107,7 +107,7 @@ function run_pipeline_phishing_email(){
preprocess --vocab_hash_file=${MORPHEUS_ROOT}/morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
${INFERENCE_STAGE} \
monitor --description "Inference Rate" --smoothing=0.001 --unit inf \
add-class --label=pred --threshold=0.7 \
add-class --label=is_phishing --threshold=0.7 \
validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --overwrite \
serialize \
to-file --filename=${OUTPUT_FILE} --overwrite
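The `add-class` change above renames the output label from `pred` to `is_phishing`. Conceptually the stage thresholds a probability column and emits a boolean column named after the label; a rough pandas sketch of that behaviour (illustrative only, not the `AddClassificationsStage` implementation):

```python
# Rough sketch of the add-class behaviour used above: threshold the "is_phishing"
# probability column at 0.7 to produce a boolean classification column.
import pandas as pd

probs = pd.DataFrame({"is_phishing": [0.12, 0.85, 0.69, 0.91]})
classified = probs[["is_phishing"]] >= 0.7
print(classified)
#    is_phishing
# 0        False
# 1         True
# 2        False
# 3         True
```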
@@ -122,7 +122,7 @@ function run_pipeline_hammah_user123(){
VAL_OUTPUT=$5

morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \
preprocess \
@@ -144,7 +144,7 @@ function run_pipeline_hammah_role-g(){
VAL_OUTPUT=$5

morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" \
pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \
from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \
train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \
preprocess \
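Both hammah runs now pass `--timestamp_column_name="event_dt"` so the AE stages know which CloudTrail column carries the event time. As a rough illustration (assumed column semantics, not the actual DFP source-stage code), per-user filtering and time ordering look like this:

```python
# Illustrative only: filter CloudTrail rows for one user and order them by the
# timestamp column, mirroring --userid_filter / --timestamp_column_name above.
import pandas as pd

df = pd.DataFrame({
    "userIdentitysessionContextsessionIssueruserName": ["user123", "role-g", "user123"],
    "event_dt": pd.to_datetime(["2021-01-01 00:03", "2021-01-01 00:02", "2021-01-01 00:01"]),
})

user_events = (df[df["userIdentitysessionContextsessionIssueruserName"] == "user123"]
               .sort_values("event_dt"))
print(user_events)
```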
2 changes: 1 addition & 1 deletion tests/benchmarks/test_bench_e2e_pipelines.py
@@ -186,7 +186,7 @@ def test_phishing_nlp_e2e(benchmark, tmp_path):
config.model_max_batch_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["model_max_batch_size"]
config.feature_length = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["feature_length"]
config.edge_buffer_size = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["edge_buffer_size"]
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
CppConfig.set_should_use_cpp(True)

input_filepath = E2E_TEST_CONFIGS["test_phishing_nlp_e2e"]["file_path"]
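The benchmark now derives its class labels from `labels_phishing.txt` instead of a hard-coded list. A helper of that shape could look like the sketch below (illustrative; the real `morpheus.utils.file_utils.load_labels_file` may differ, and the file is assumed to hold one label per line, e.g. `score` and `is_phishing`):

```python
# Illustrative equivalent of a load_labels_file-style helper: one label per
# non-empty line of a text file.
from pathlib import Path


def load_labels(path: str) -> list[str]:
    return [line.strip() for line in Path(path).read_text().splitlines() if line.strip()]
```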
9 changes: 5 additions & 4 deletions tests/test_phishing.py
@@ -34,6 +34,7 @@
from morpheus.stages.postprocess.validation_stage import ValidationStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
from morpheus.utils.file_utils import load_labels_file

# End-to-end test intended to imitate the Phishing validation test
FEATURE_LENGTH = 128
@@ -71,7 +72,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
mock_triton_client.async_infer.side_effect = async_infer

config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -96,7 +97,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(
ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
pipe.add_stage(SerializeStage(config))
@@ -112,7 +113,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path):
@pytest.mark.usefixtures("launch_mock_triton")
def test_email_cpp(config, tmp_path):
config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -139,7 +140,7 @@ def test_email_cpp(config, tmp_path):
server_url='localhost:8001',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(
ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05))
pipe.add_stage(SerializeStage(config))
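Both tests keep `ValidationStage(..., rel_tol=0.05)`, i.e. the pipeline output only needs to match the validation file within a 5% relative tolerance. The underlying idea is the standard relative-tolerance comparison (illustrative; the stage compares whole dataframes, not scalars):

```python
# The kind of relative-tolerance check a rel_tol=0.05 validation implies.
import math

expected, actual = 0.934, 0.915
print(math.isclose(actual, expected, rel_tol=0.05))  # True: within 5% of expected
```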
9 changes: 5 additions & 4 deletions tests/test_phishing_kafka.py
@@ -41,6 +41,7 @@
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage
from morpheus.utils.compare_df import compare_df
from morpheus.utils.file_utils import load_labels_file

if (typing.TYPE_CHECKING):
from kafka import KafkaConsumer
@@ -87,7 +88,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
mock_triton_client.async_infer.side_effect = async_infer

config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -120,7 +121,7 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock,
TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(SerializeStage(config))
pipe.add_stage(
WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))
@@ -154,7 +155,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
kafka_topics: KafkaTopics,
kafka_consumer: "KafkaConsumer"):
config.mode = PipelineModes.NLP
config.class_labels = ["score", "pred"]
config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt"))
config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
config.pipeline_batch_size = 1024
config.feature_length = FEATURE_LENGTH
@@ -187,7 +188,7 @@ def test_email_cpp(dataset_pandas: DatasetManager,
server_url='localhost:8001',
force_convert_inputs=True))
pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf"))
pipe.add_stage(AddClassificationsStage(config, labels=["pred"], threshold=0.7))
pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7))
pipe.add_stage(SerializeStage(config))
pipe.add_stage(
WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic))
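The Kafka variants write the serialized output to a topic and then compare what the consumer reads back against the validation data via `compare_df`. In spirit, that final assertion is a dataframe equality check like the one below (illustrative pandas, not the Morpheus helper):

```python
# Illustrative stand-in for the compare_df assertion at the end of the Kafka tests:
# the rows read back from the output topic should match the expected results.
import pandas as pd

expected = pd.DataFrame({"is_phishing": [False, True, True]})
received = pd.DataFrame({"is_phishing": [False, True, True]})
pd.testing.assert_frame_equal(received, expected)  # raises if they differ
```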