Skip to content

Commit

Permalink
Replace usage of PreprocessLogParsingStage with PreprocessNLPStage (#842)
Browse files Browse the repository at this point in the history

* Code in `PreprocessLogParsingStage` was about 99% the same as `PreprocessNLPStage`
* The only thing `PreprocessLogParsingStage` provided was different default constructor values, along with some special handling of punctuation. However, the cudf subword_tokenizer removes all punctuation.

fixes #801

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: #842
  • Loading branch information
dagardner-nv authored Apr 10, 2023
1 parent 3fc48c0 commit bfcee5f
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 202 deletions.
3 changes: 1 addition & 2 deletions examples/log_parsing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,13 @@ From the root of the Morpheus repo run:
```bash
PYTHONPATH="examples/log_parsing" \
morpheus --log_level INFO \
--plugin "preprocessing" \
--plugin "inference" \
--plugin "postprocessing" \
run --num_threads 1 --use_cpp False --pipeline_batch_size 1024 --model_max_batch_size 32 \
pipeline-nlp \
from-file --filename ./models/datasets/validation-data/log-parsing-validation-data-input.csv \
deserialize \
log-preprocess --vocab_hash_file ./models/training-tuning-scripts/sid-models/resources/bert-base-cased-hash.txt --stride 64 \
preprocess --vocab_hash_file ./models/training-tuning-scripts/sid-models/resources/bert-base-cased-hash.txt --stride 64 --column=raw \
monitor --description "Preprocessing rate" \
inf-logparsing --model_name log-parsing-onnx --server_url localhost:8001 --force_convert_inputs=True \
monitor --description "Inference rate" --unit inf \
Expand Down
190 changes: 0 additions & 190 deletions examples/log_parsing/preprocessing.py

This file was deleted.

15 changes: 8 additions & 7 deletions examples/log_parsing/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import click
from inference import LogParsingInferenceStage
from postprocessing import LogParsingPostProcessingStage
from preprocessing import PreprocessLogParsingStage

from morpheus.config import Config
from morpheus.config import CppConfig
Expand All @@ -27,6 +26,7 @@
from morpheus.stages.input.file_source_stage import FileSourceStage
from morpheus.stages.output.write_to_file_stage import WriteToFileStage
from morpheus.stages.preprocess.deserialize_stage import DeserializeStage
from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage


@click.command()
Expand Down Expand Up @@ -115,12 +115,13 @@ def run_pipeline(
# Add a preprocessing stage.
# This stage preprocesses the rows in the DataFrame.
pipeline.add_stage(
PreprocessLogParsingStage(config,
vocab_hash_file=model_vocab_hash_file,
truncation=False,
do_lower_case=False,
stride=64,
add_special_tokens=False))
PreprocessNLPStage(config,
vocab_hash_file=model_vocab_hash_file,
truncation=False,
do_lower_case=False,
stride=64,
add_special_tokens=False,
column="raw"))

# Add a monitor stage.
# This stage logs the metrics (msg/sec) from the above stage.
Expand Down
3 changes: 0 additions & 3 deletions morpheus/stages/preprocess/preprocess_nlp_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import mrc

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

import morpheus._lib.stages as _stages
from morpheus.cli.register_stage import register_stage
Expand Down Expand Up @@ -94,8 +93,6 @@ def __init__(self,
self._do_lower_case = do_lower_case
self._add_special_tokens = add_special_tokens

self._tokenizer: SubwordTokenizer = None

@property
def name(self) -> str:
return "preprocess-nlp"
Expand Down

0 comments on commit bfcee5f

Please sign in to comment.