Ensure default path values in morpheus.cli are no longer relative #62

Merged into branch-22.06 from david-cli-rel-paths on May 27, 2022 (28 commits).

Commits:
- 61680dc: Add/update unittests to check for issue #60 (dagardner-nv, Apr 27, 2022)
- f840074: Ensure default path values are no longer relative to the current dir,… (dagardner-nv, Apr 27, 2022)
- edc75bd: Move simple file reads to a helper function (dagardner-nv, May 3, 2022)
- a7263fc: Merge branch 'branch-22.06' into david-cli-rel-paths (dagardner-nv, May 16, 2022)
- 06fb137: WIP (dagardner-nv, May 16, 2022)
- c2c467b: Move data (dagardner-nv, May 16, 2022)
- ce01a4a: Add missing dep for pybind11-stubgen (dagardner-nv, May 17, 2022)
- 0b6d959: Don't add deps for pybind11 stub files when we aren't doing an inplac… (dagardner-nv, May 17, 2022)
- 827ee41: Add MANIFEST.in to list of installed files (dagardner-nv, May 17, 2022)
- 4ef5624: Copy data dir, and files previously set by package_data (dagardner-nv, May 17, 2022)
- c2c5975: Remove package_data, unfortunately the setuptools docs are vague and … (dagardner-nv, May 17, 2022)
- 4186357: Remove unused MORPHEUS_ROOT attr (dagardner-nv, May 17, 2022)
- 65473c6: Update path in examples for new data location (dagardner-nv, May 17, 2022)
- be44798: Merge branch 'branch-22.06' into david-cli-rel-paths (dagardner-nv, May 17, 2022)
- 7ae1e30: Fix import path (dagardner-nv, May 17, 2022)
- 329a6a6: Update paths in examples (dagardner-nv, May 17, 2022)
- 405b539: Update data path in docs (dagardner-nv, May 17, 2022)
- 1c7f421: fix path (dagardner-nv, May 17, 2022)
- c0d5281: Update lfs to reflect data dir move (dagardner-nv, May 17, 2022)
- ce37b33: Remove unneded fea_length (dagardner-nv, May 17, 2022)
- 61ebfcf: Style fixes (dagardner-nv, May 18, 2022)
- 5a84ff2: Update docs/source/basics/examples.rst (dagardner-nv, May 18, 2022)
- dfdeacc: Merge branch 'branch-22.06' into david-cli-rel-paths (dagardner-nv, May 23, 2022)
- f59dcac: Fixing non-inplace builds install of stub files (mdemoret-nv, May 23, 2022)
- 7801803: Move data into previous install command (dagardner-nv, May 23, 2022)
- f398f78: Merge branch 'david-cli-rel-paths' of github.com:dagardner-nv/Morpheu… (dagardner-nv, May 23, 2022)
- 798953a: Remove lfs filter for old data location (dagardner-nv, May 23, 2022)
- a94dd62: Merge branch 'branch-22.06' into david-cli-rel-paths (dagardner-nv, May 24, 2022)
2 changes: 1 addition & 1 deletion .gitattributes
@@ -1,4 +1,4 @@
morpheus/_version.py export-subst
- data/* filter=lfs diff=lfs merge=lfs -text
+ morpheus/data/* filter=lfs diff=lfs merge=lfs -text
tests/expected_data/* filter=lfs diff=lfs merge=lfs -text
tests/mock_triton_server/payloads/** filter=lfs diff=lfs merge=lfs -text
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -93,6 +93,7 @@ set(MORPHEUS_ROOT_PYTHON_FILES
"setup.cfg"
"setup.py"
"versioneer.py"
"MANIFEST.in"
)

# Add the root python files to the list
@@ -134,6 +135,7 @@ install(
FILES_MATCHING
PATTERN "*.py"
PATTERN "py.typed"
PATTERN "data/*"
)


4 changes: 3 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,4 @@
include versioneer.py
- include morpheus/_version.py
+ include morpheus/_version.py
+ recursive-include morpheus/data *
+ recursive-include morpheus *.so py.typed *.pyi
10 changes: 5 additions & 5 deletions docs/source/basics/examples.rst
@@ -35,15 +35,15 @@ This example will copy the values from Kafka into ``out.jsonlines``.
Remove Fields from JSON Objects
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- This example will only copy the fiels 'timestamp', 'src_ip' and 'dest_ip' from ``data/pcap_dump.jsonlines`` to
+ This example will only copy the fields 'timestamp', 'src_ip' and 'dest_ip' from ``morpheus/data/pcap_dump.jsonlines`` to
``out.jsonlines``.

.. image:: img/remove_fields_from_json_objects.png

.. code-block:: bash

morpheus run pipeline-nlp --viz_file=basic_usage_img/remove_fields_from_json_objects.png \
- from-file --filename data/pcap_dump.jsonlines \
+ from-file --filename morpheus/data/pcap_dump.jsonlines \
deserialize \
serialize --include 'timestamp' --include 'src_ip' --include 'dest_ip' \
to-file --filename out.jsonlines
@@ -58,7 +58,7 @@ This example will report the throughput on the command line.
.. code-block:: console

$ morpheus run pipeline-nlp --viz_file=basic_usage_img/monitor_throughput.png \
- from-file --filename data/pcap_dump.jsonlines \
+ from-file --filename morpheus/data/pcap_dump.jsonlines \
deserialize \
monitor --description "Lines Throughput" --smoothing 0.1 --unit "lines" \
serialize \
@@ -79,7 +79,7 @@ decouple one stage from the next. Without the buffers, all monitoring would show
.. code-block:: console

$ morpheus run pipeline-nlp --viz_file=basic_usage_img/multi_monitor_throughput.png \
- from-file --filename data/pcap_dump.jsonlines \
+ from-file --filename morpheus/data/pcap_dump.jsonlines \
monitor --description "From File Throughput" \
buffer \
deserialize \
@@ -107,7 +107,7 @@ This example shows an NLP Pipeline which uses most stages available in Morpheus.

$ morpheus run --num_threads=8 --pipeline_batch_size=1024 --model_max_batch_size=32 \
pipeline-nlp --viz_file=basic_usage_img/nlp_kitchen_sink.png \
- from-file --filename data/pcap_dump.jsonlines \
+ from-file --filename morpheus/data/pcap_dump.jsonlines \
buffer --count=500 \
deserialize \
preprocess \
23 changes: 12 additions & 11 deletions docs/source/developer_guide/guides/2_real_world_phishing.md
@@ -185,12 +185,13 @@ From this information, we can see that the expected shape of the model inputs is
Let's set up the paths for our input and output files. For simplicity, we assume that the `MORPHEUS_ROOT` environment variable is set to the root of the Morpheus project repository. In a production deployment, it may be more prudent to replace our usage of environment variables with command-line flags or a dedicated configuration management library.

```python
+ import morpheus
+
root_dir = os.environ['MORPHEUS_ROOT']
out_dir = os.environ.get('OUT_DIR', '/tmp')

- data_dir = os.path.join(root_dir, 'data')
- labels_file = os.path.join(data_dir, 'labels_phishing.txt')
- vocab_file = os.path.join(data_dir, 'bert-base-uncased-hash.txt')
+ labels_file = os.path.join(morpheus.DATA_DIR, 'labels_phishing.txt')
+ vocab_file = os.path.join(morpheus.DATA_DIR, 'bert-base-uncased-hash.txt')

input_file = os.path.join(root_dir, 'examples/data/email.jsonlines')
results_file = os.path.join(out_dir, 'detections.jsonlines')
@@ -213,9 +214,9 @@ First we set our pipeline mode to NLP. Next, we use the third-party [psutils](ht

The `feature_length` property needs to match the length of the model inputs, which we got from Triton in the previous section using the model's `/config` endpoint.

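As an aside, that value can be fetched programmatically. Below is a minimal sketch against Triton's HTTP `/config` endpoint, assuming the `requests` package; the server address and model name are illustrative placeholders:

```python
# Sketch: query Triton's KServe v2 HTTP API for a model's input shapes.
import requests

resp = requests.get("http://localhost:8000/v2/models/phishing-bert-onnx/config")
resp.raise_for_status()

for model_input in resp.json()["input"]:
    # dims is typically [-1, seq_len]; the last dim is the value that
    # feature_length must match.
    print(model_input["name"], model_input["dims"])
```
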
- Ground truth classification labels are read from the `data/labels_phishing.txt` file included in Morpheus.
+ Ground truth classification labels are read from the `morpheus/data/labels_phishing.txt` file included in Morpheus.

- Now that our config object is populated, we move on to the pipeline itself. We will be using the same input file from the previous examples, and to tokenize the input data we will use Morpheus' `PreprocessNLPStage`.
+ Now that our config object is populated, we move on to the pipeline itself. We will be using the same input file from the previous examples, and to tokenize the input data we will use Morpheus' `PreprocessNLPStage`.

This stage uses the [cudf subword tokenizer](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.core.subword_tokenizer.SubwordTokenizer.__call__.html) to transform strings into a tensor of numbers to be fed into the neural network model. Rather than split the string by characters or whitespaces, we split them into meaningful subwords based upon the occurrence of the subwords in a large training corpus. You can find more details here: [https://arxiv.org/abs/1810.04805v2](https://arxiv.org/abs/1810.04805v2). All we need to know for now is that the text will be converted to subword token ids based on the vocabulary file that we provide (`vocab_hash_file=vocab file`).

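To get a feel for what this tokenization produces, here is a small standalone sketch of the cudf subword tokenizer; the vocabulary-hash path assumes the new `morpheus/data` location introduced by this PR:

```python
# Sketch: tokenize a string into BERT subword token ids with cuDF.
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer

tokenizer = SubwordTokenizer('morpheus/data/bert-base-uncased-hash.txt', do_lower_case=True)

text = cudf.Series(["Morpheus converts text into subword token ids."])
tokens = tokenizer(text,
                   max_length=128,
                   max_num_rows=len(text),
                   padding='max_length',
                   truncation=True,
                   add_special_tokens=False,
                   return_tensors='cp')

print(tokens['input_ids'].shape)  # one row of 128 token ids (cupy array)
```
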
@@ -232,7 +233,7 @@ pipeline.add_stage(
```

In addition to providing the `Config` object that we defined above, we also configure this stage to:
- * Use the `data/bert-base-uncased-hash.txt` vocabulary file for its subword token ids (`vocab_hash_file=vocab_file`).
+ * Use the `morpheus/data/bert-base-uncased-hash.txt` vocabulary file for its subword token ids (`vocab_hash_file=vocab_file`).
* Truncate the length of the text to a max number of tokens (`truncation=True`).
* Change the casing to all lowercase (`do_lower_case=True`).
* Refrain from adding the default BERT special tokens like `[SEP]` for separation between two sentences and `[CLS]` at the start of the text (`add_special_tokens=False`).
@@ -258,7 +259,7 @@ pipeline.add_stage(MonitorStage(config, description="Inference Rate", smoothing=
pipeline.add_stage(FilterDetectionsStage(config, threshold=0.9))
```

- Lastly, we will save our results to disk. For this purpose, we are using two stages that are often used in conjunction with each other: `SerializeStage` and `WriteToFileStage`.
+ Lastly, we will save our results to disk. For this purpose, we are using two stages that are often used in conjunction with each other: `SerializeStage` and `WriteToFileStage`.

The `SerializeStage` is used to include and exclude columns as desired in the output. Importantly, it also handles conversion from the `MultiMessage`-derived output type that is used by the `FilterDetectionsStage` to the `MessageMeta` class that is expected as input by the `WriteToFileStage`.

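Continuing the `pipeline` object from this guide's running example, the two stages are typically added back to back; a minimal sketch, with import paths assuming the 22.06 stage layout seen elsewhere in this PR:

```python
from morpheus.stages.output.write_to_file_stage import WriteToFileStage
from morpheus.stages.postprocess.serialize_stage import SerializeStage

# Convert the MultiMessage-derived output to MessageMeta, then write to disk.
pipeline.add_stage(SerializeStage(config))
pipeline.add_stage(WriteToFileStage(config, filename=results_file, overwrite=True))
```
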
@@ -285,6 +286,7 @@ import os

import psutil

+ import morpheus
from morpheus.config import Config
from morpheus.config import PipelineModes
from morpheus.pipeline import LinearPipeline
@@ -308,9 +310,8 @@ def run_pipeline():
root_dir = os.environ['MORPHEUS_ROOT']
out_dir = os.environ.get('OUT_DIR', '/tmp')

- data_dir = os.path.join(root_dir, 'data')
- labels_file = os.path.join(data_dir, 'labels_phishing.txt')
- vocab_file = os.path.join(data_dir, 'bert-base-uncased-hash.txt')
+ labels_file = os.path.join(morpheus.DATA_DIR, 'labels_phishing.txt')
+ vocab_file = os.path.join(morpheus.DATA_DIR, 'bert-base-uncased-hash.txt')

input_file = os.path.join(root_dir, 'examples/data/email.jsonlines')
results_file = os.path.join(out_dir, 'detections.jsonlines')
@@ -377,7 +378,7 @@ if __name__ == "__main__":

In our previous examples, we didn't define a constructor for the Python classes that we were building for our stages. However, there are many cases where we will need to receive configuration parameters. Every stage constructor must receive an instance of a `morpheus.config.Config` object as its first argument and is then free to define additional stage-specific arguments after that. The Morpheus config object will contain configuration parameters needed by multiple stages in the pipeline, and the constructor in each Morpheus stage is free to inspect these. In contrast, parameters specific to a single stage are typically defined as constructor arguments.

- Note that it is a best practice to perform any necessary validation checks in the constructor. This allows us to fail early rather than after the pipeline has started.
+ Note that it is a best practice to perform any necessary validation checks in the constructor. This allows us to fail early rather than after the pipeline has started.

In our `RecipientFeaturesStage` example, we hard-coded the Bert separator token. Let's instead refactor the code to receive that as a constructor argument. Let's also take the opportunity to verify that the pipeline mode is set to `morpheus.config.PipelineModes.NLP`. Our refactored class definition now looks like:

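(The refactored code block is elided in this diff view; the following is a sketch of its shape, assuming the `SinglePortStage` base class and the imports used earlier in this guide:)

```python
from morpheus.config import Config
from morpheus.config import PipelineModes
from morpheus.pipeline.single_port_stage import SinglePortStage


class RecipientFeaturesStage(SinglePortStage):

    def __init__(self, config: Config, sep_token: str = '[SEP]'):
        super().__init__(config)

        # Fail early: this stage only makes sense in an NLP pipeline.
        if config.mode != PipelineModes.NLP:
            raise RuntimeError("RecipientFeaturesStage must be used in a pipeline configured for NLP")

        # Fail early: an empty separator token would silently corrupt features.
        if not sep_token:
            raise ValueError("sep_token cannot be an empty string")

        self._sep_token = sep_token
```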
28 changes: 14 additions & 14 deletions docs/source/morpheus_quickstart_guide.md
@@ -558,11 +558,11 @@ $ helm install --set ngc.apiKey="$API_KEY" \
--use_cpp=True \
pipeline-nlp \
--model_seq_length=128 \
- --labels_file=./data/labels_phishing.txt \
- from-file --filename=./data/email.jsonlines \
+ --labels_file=./morpheus/data/labels_phishing.txt \
+ from-file --filename=./morpheus/data/email.jsonlines \
monitor --description 'FromFile Rate' --smoothing=0.001 \
deserialize \
- preprocess --vocab_hash_file=./data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
+ preprocess --vocab_hash_file=./morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
monitor --description 'Preprocess Rate' \
inf-triton --model_name=phishing-bert-onnx --server_url=ai-engine:8001 --force_convert_inputs=True \
monitor --description 'Inference Rate' --smoothing=0.001 --unit inf \
@@ -588,11 +588,11 @@ $ helm install --set ngc.apiKey="$API_KEY" \
--use_cpp=True \
pipeline-nlp \
--model_seq_length=128 \
- --labels_file=./data/labels_phishing.txt \
+ --labels_file=./morpheus/data/labels_phishing.txt \
from-kafka --input_topic <YOUR_INPUT_KAFKA_TOPIC> --bootstrap_servers broker:9092 \
monitor --description 'FromKafka Rate' --smoothing=0.001 \
deserialize \
- preprocess --vocab_hash_file=./data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
+ preprocess --vocab_hash_file=./morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
monitor --description 'Preprocess Rate' \
inf-triton --force_convert_inputs=True --model_name=phishing-bert-onnx --server_url=ai-engine:8001 \
monitor --description='Inference Rate' --smoothing=0.001 --unit inf \
@@ -635,10 +635,10 @@ $ helm install --set ngc.apiKey="$API_KEY" \
--model_max_batch_size=32 \
pipeline-nlp \
--model_seq_length=256 \
- from-file --filename=./data/pcap_dump.jsonlines \
+ from-file --filename=./morpheus/data/pcap_dump.jsonlines \
monitor --description 'FromFile Rate' --smoothing=0.001 \
deserialize \
- preprocess --vocab_hash_file=./data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
+ preprocess --vocab_hash_file=./morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
monitor --description='Preprocessing rate' \
inf-triton --force_convert_inputs=True --model_name=sid-minibert-onnx --server_url=ai-engine:8001 \
monitor --description='Inference rate' --smoothing=0.001 --unit inf \
@@ -667,7 +667,7 @@ $ helm install --set ngc.apiKey="$API_KEY" \
from-kafka --input_topic <YOUR_INPUT_KAFKA_TOPIC> --bootstrap_servers broker:9092 \
monitor --description 'FromKafka Rate' --smoothing=0.001 \
deserialize \
- preprocess --vocab_hash_file=./data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
+ preprocess --vocab_hash_file=./morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \
monitor --description='Preprocessing Rate' \
inf-triton --force_convert_inputs=True --model_name=sid-minibert-onnx --server_url=ai-engine:8001 \
monitor --description='Inference Rate' --smoothing=0.001 --unit inf \
@@ -685,7 +685,7 @@ Make sure you create input and output Kafka topics before you start the pipeline
$ kubectl -n $NAMESPACE exec -it deploy/broker -c broker -- kafka-console-producer.sh \
--broker-list broker:9092 \
--topic <YOUR_INPUT_KAFKA_TOPIC> < \
- <YOUR_INPUT_DATA_FILE_PATH_EXAMPLE: ${HOME}/data/pcap_dump.jsonlines>
+ <YOUR_INPUT_DATA_FILE_PATH_EXAMPLE: ${HOME}/morpheus/data/pcap_dump.jsonlines>
```

**Note**: This should be used for development purposes only via this developer kit. Loading from the file into Kafka should not be used in production deployments of Morpheus.
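The topics can also be created programmatically. A sketch using the `confluent_kafka` admin API, assuming that package is installed and the broker is reachable; the topic names are placeholders:

```python
from confluent_kafka.admin import AdminClient, NewTopic

admin = AdminClient({'bootstrap.servers': 'broker:9092'})
futures = admin.create_topics([
    NewTopic('my-input-topic', num_partitions=1, replication_factor=1),
    NewTopic('my-output-topic', num_partitions=1, replication_factor=1),
])

for topic, future in futures.items():
    future.result()  # raises on failure (e.g. topic already exists)
    print(f"created {topic}")
```
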
@@ -708,7 +708,7 @@ $ helm install --set ngc.apiKey="$API_KEY" \
--model_max_batch_size=64 \
--use_cpp=True \
pipeline-fil \
- from-file --filename=./data/nvsmi.jsonlines \
+ from-file --filename=./morpheus/data/nvsmi.jsonlines \
monitor --description 'FromFile Rate' --smoothing=0.001 \
deserialize \
preprocess \
@@ -754,7 +754,7 @@ Make sure you create input and output Kafka topics before you start the pipeline
$ kubectl -n $NAMESPACE exec -it deploy/broker -c broker -- kafka-console-producer.sh \
--broker-list broker:9092 \
--topic <YOUR_INPUT_KAFKA_TOPIC> < \
- <YOUR_INPUT_DATA_FILE_PATH_EXAMPLE: ${HOME}/data/nvsmi.jsonlines>
+ <YOUR_INPUT_DATA_FILE_PATH_EXAMPLE: ${HOME}/morpheus/data/nvsmi.jsonlines>
```

**Note**: This should be used for development purposes only via this developer kit. Loading from the file into Kafka should not be used in production deployments of Morpheus.
@@ -937,7 +937,7 @@ Options:
order to convert class IDs into labels. A
label file is a simple text file where each
line corresponds to a label [default:
- data/labels_nlp.txt]
+ morpheus/data/labels_nlp.txt]
--viz_file FILE Save a visualization of the pipeline at the
specified location
--help Show this message and exit.
@@ -1000,7 +1000,7 @@ Options:
only a single output label is created for
FIL
--columns_file FILE Specifies a file to read column features.
- [default: data/columns_fil.txt]
+ [default: morpheus/data/columns_fil.txt]
--viz_file FILE Save a visualization of the pipeline at the
specified location
--help Show this message and exit.
@@ -1052,7 +1052,7 @@ Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2
4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz`

Options:
- --columns_file FILE [default: data/columns_ae.txt]
+ --columns_file FILE [default: morpheus/data/columns_ae.txt]
--labels_file FILE Specifies a file to read labels from in order to
convert class IDs into labels. A label file is a
simple text file where each line corresponds to a
6 changes: 3 additions & 3 deletions examples/abp_nvsmi_detection/README.md
@@ -46,7 +46,7 @@ $ nvidia-smi dmon

Each line in the output represents the GPU metrics at a single point in time. As the tool progresses the GPU begins to be utilized and you can see the SM% and Mem% increase as memory is loaded into the GPU and computations are performed. The model we will be using can ingest this information and determine whether or not the GPU is mining cryptocurrencies without needing additional information from the host machine.

- In this example we will be using the `data/nvsmi.jsonlines` dataset that is known to contain mining behavior profiles. The dataset is in the `.jsonlines` format which means each new line represents an new JSON object. In order to parse this data, it must be ingested, split by lines into individual JSON objects, and parsed into cuDF dataframes. This will all be handled by Morpheus.
+ In this example we will be using the `morpheus/data/nvsmi.jsonlines` dataset that is known to contain mining behavior profiles. The dataset is in the `.jsonlines` format which means each new line represents a new JSON object. In order to parse this data, it must be ingested, split by lines into individual JSON objects, and parsed into cuDF dataframes. This will all be handled by Morpheus.

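A minimal sketch of that ingest step on its own, assuming a working cuDF installation and the new `morpheus/data` location:

```python
import cudf

# Each line of the .jsonlines file is a standalone JSON object;
# lines=True parses one dataframe row per line.
df = cudf.read_json('morpheus/data/nvsmi.jsonlines', lines=True)
print(len(df), "rows")
print(df.head())
```
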
## Pipeline Architecture

@@ -100,9 +100,9 @@ morpheus --log_level=DEBUG \
`# Run a pipeline with 8 threads and a model batch size of 32 (Must be equal or less than Triton config)` \
run --num_threads=8 --pipeline_batch_size=1024 --model_max_batch_size=1024 \
`# Specify a NLP pipeline with 256 sequence length (Must match Triton config)` \
- pipeline-fil --columns_file=$MORPHEUS_ROOT/data/columns_fil.txt \
+ pipeline-fil \
`# 1st Stage: Read from file` \
- from-file --filename=$MORPHEUS_ROOT/data/nvsmi.jsonlines \
+ from-file --filename=$MORPHEUS_ROOT/morpheus/data/nvsmi.jsonlines \
`# 2nd Stage: Deserialize from JSON strings to objects` \
deserialize \
`# 3rd Stage: Preprocessing converts the input data into BERT tokens` \
4 changes: 2 additions & 2 deletions examples/abp_pcap_detection/README.md
@@ -83,11 +83,11 @@
--help Show this message and exit.
```

- To launch the configured Morpheus pipeline with the sample data that is provided at `<MORPHEUS_ROOT>/data`, run the following:
+ To launch the configured Morpheus pipeline with the sample data that is provided at `<MORPHEUS_ROOT>/morpheus/data`, run the following:

```bash
python run.py \
- --input_file ../../data/abp_pcap_dump.jsonlines \
+ --input_file ../../morpheus/data/abp_pcap_dump.jsonlines \
--output_file ./pcap_out.jsonlines \
--model_name 'abp-pcap-xgb' \
--server_url localhost:8001
2 changes: 1 addition & 1 deletion examples/abp_pcap_detection/abp_pcap_preprocessing.py
@@ -27,7 +27,7 @@
from morpheus.messages import MultiInferenceFILMessage
from morpheus.messages import MultiInferenceMessage
from morpheus.messages import MultiMessage
- from morpheus.stages.preprocess.preprocessing import PreprocessBaseStage
+ from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage


class AbpPcapPreprocessingStage(PreprocessBaseStage):
2 changes: 1 addition & 1 deletion examples/gnn_fraud_detection_pipeline/README.md
@@ -57,7 +57,7 @@ To launch the configured Morpheus pipeline with the sample data that is provided

```bash

- python run.py
+ python run.py
====Building Pipeline====
Added source: <from-file-0; FileSourceStage(filename=validation.csv, iterative=None, file_type=auto, repeat=1, filter_null=False, cudf_kwargs=None)>
└─> morpheus.MessageMeta
4 changes: 2 additions & 2 deletions examples/gnn_fraud_detection_pipeline/run.py
@@ -13,9 +13,9 @@
# limitations under the License.

import logging
+ import os

import click
- import psutil

from morpheus.config import Config
from morpheus.config import CppConfig
@@ -35,7 +35,7 @@
@click.command()
@click.option(
"--num_threads",
- default=psutil.cpu_count(),
+ default=os.cpu_count(),
type=click.IntRange(min=1),
help="Number of internal pipeline threads to use",
)