Refactor converter for better maintainability and readability

mlcommons · Jul 16, 2024 · 48de9fa · 48de9fa
1 parent a6590fe
commit 48de9fa
Show file tree

Hide file tree

Showing 6 changed files with 501 additions and 539 deletions.
diff --git a/USER_GUIDE.md b/USER_GUIDE.md
@@ -54,13 +54,16 @@ $ chakra_trace_link \
 ```
 
 ### Execution Trace Converter (chakra_converter)
-Converts the merged execution traces into the Chakra schema.
+Converts the execution traces from `chakra_trace_link` into traces in the protobuf format. It also identifies and encodes dependencies for simulation. The converter is designed for any downstream simulator that takes Chakra execution traces in the protobuf format: it takes an input file in another format and generates a Chakra execution trace output in the protobuf format.
 ```bash
-$ chakra_converter \
-    --input_filename /path/to/chakra_host_device_trace.json \
-    --output_filename /path/to/chakra_trace \
-    --input_type <input_type>
+$ chakra_converter PyTorch \
+    --input /path/to/chakra_host_device_trace.json \
+    --output /path/to/chakra_trace \
+    [--simulate]
 ```
+* `--input`: Path to the input file containing the merged Chakra host and device traces in JSON format.
+* `--output`: Path to the output file where the converted Chakra trace will be saved in protobuf format.
+* `--simulate`: (Optional) Enable simulation of operators after the conversion for validation and debugging purposes. This option allows simulation of traces without running them through a simulator. Users can validate the converter or simulator against actual measured values using tools like chrome://tracing or https://perfetto.dev/. Read the duration of the timeline and compare the total execution time against the final simulation time of a trace. Disabled by default because it takes a long time.
 
 ### Execution Trace Feeder (et_feeder)
 The Execution Trace Feeder (et_feeder) is a C++ library designed to feed Chakra traces into any compatible C++ simulator. This library specifically provides dependency-free nodes to a simulator, which must import the feeder as a library. Currently, ASTRA-sim is the only simulator that supports this trace feeder. Below are the commands to run execution traces on ASTRA-sim:

diff --git a/src/converter/converter.py b/src/converter/converter.py
@@ -1,13 +1,12 @@
 import argparse
 import logging
-import sys
-import traceback
 
 from .pytorch_converter import PyTorchConverter
 from .text_converter import TextConverter
 
 
 def setup_logging(log_filename: str) -> None:
+    """Set up logging to file and stream handlers."""
     formatter = logging.Formatter("%(levelname)s [%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p")
 
     file_handler = logging.FileHandler(log_filename, mode="w")
@@ -21,46 +20,95 @@ def setup_logging(log_filename: str) -> None:
     logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, stream_handler])
 
 
+def convert_text(args: argparse.Namespace) -> None:
+    """Convert text input trace to Chakra execution trace."""
+    converter = TextConverter(args.input, args.output, args.num_npus, args.num_passes)
+    converter.convert()
+
+
+def convert_pytorch(args: argparse.Namespace) -> None:
+    """Convert PyTorch input trace to Chakra execution trace."""
+    converter = PyTorchConverter()
+    converter.convert(args.input, args.output, args.simulate)
+
+
 def main() -> None:
-    parser = argparse.ArgumentParser(description="Execution Trace Converter")
-    parser.add_argument("--input_type", type=str, default=None, required=True, help="Input execution trace type")
-    parser.add_argument(
-        "--input_filename", type=str, default=None, required=True, help="Input execution trace filename"
+    """Convert to Chakra execution trace in the protobuf format."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Chakra execution trace converter for simulators. This converter is designed for any downstream "
+            "simulators that take Chakra execution traces in the protobuf format. This converter takes an input file "
+            "in another format and generates a Chakra execution trace output in the protobuf format."
+        )
+    )
+
+    parser.add_argument("--log-filename", type=str, default="debug.log", help="Log filename")
+
+    subparsers = parser.add_subparsers(title="subcommands", description="Valid subcommands", help="Input type")
+
+    pytorch_parser = subparsers.add_parser(
+        "PyTorch",
+        help="Convert Chakra host + device execution trace in JSON to Chakra host + device execution trace in the "
+        "Chakra schema with protobuf format",
     )
-    parser.add_argument(
-        "--output_filename", type=str, default=None, required=True, help="Output Chakra execution trace filename"
+    pytorch_parser.add_argument(
+        "--input", type=str, required=True, help="Input Chakra host + device traces in the JSON format"
     )
-    parser.add_argument(
-        "--num_npus", type=int, default=None, required="Text" in sys.argv, help="Number of NPUs in a system"
+    pytorch_parser.add_argument(
+        "--output", type=str, required=True, help="Output Chakra host + device traces in the protobuf format"
     )
-    parser.add_argument(
-        "--num_passes", type=int, default=None, required="Text" in sys.argv, help="Number of training passes"
+    pytorch_parser.add_argument(
+        "--simulate",
+        action="store_true",
+        help=(
+            "Enable simulation of operators after the conversion for validation and debugging purposes. This option "
+            "allows simulation of traces without running them through a simulator. Users can validate the converter "
+            "or simulator against actual measured values using tools like chrome://tracing or https://perfetto.dev/. "
+            "Read the duration of the timeline and compare the total execution time against the final simulation time "
+            "of a trace. Disabled by default because it takes a long time."
+        ),
     )
-    parser.add_argument("--simulate", action="store_true", help="Run simulate_execution if set")
-    parser.add_argument("--log_filename", type=str, default="debug.log", help="Log filename")
+    pytorch_parser.set_defaults(func=convert_pytorch)
+
+    text_parser = subparsers.add_parser(
+        "Text", help="Convert text-based model description to Chakra schema-based traces in the protobuf format"
+    )
+    text_parser.add_argument(
+        "--input",
+        type=str,
+        required=True,
+        help=(
+            "Input file in the text format that describes a model. This follows the text format used in ASTRA-sim: "
+            "https://github.com/astra-sim/astra-sim"
+        ),
+    )
+    text_parser.add_argument(
+        "--output", type=str, required=True, help="Output Chakra execution trace filename in the protobuf format"
+    )
+    text_parser.add_argument(
+        "--num-npus",
+        type=int,
+        required=True,
+        help="Number of NPUs in a system. Determines the number of traces the converter generates",
+    )
+    text_parser.add_argument(
+        "--num-passes",
+        type=int,
+        required=True,
+        help=(
+            "Number of loops when generating traces based on the text input file. Increasing the number of passes "
+            "increases the number of training iterations for a given text input."
+        ),
+    )
+    text_parser.set_defaults(func=convert_text)
+
     args = parser.parse_args()
 
-    setup_logging(args.log_filename)
-    logging.debug(" ".join(sys.argv))
-
-    try:
-        if args.input_type == "Text":
-            converter = TextConverter(args.input_filename, args.output_filename, args.num_npus, args.num_passes)
-            converter.convert()
-        elif args.input_type == "PyTorch":
-            converter = PyTorchConverter(args.input_filename, args.output_filename, simulate=args.simulate)
-            converter.convert()
-        else:
-            supported_types = ["Text", "PyTorch"]
-            logging.error(
-                f"The input type '{args.input_type}' is not supported. "
-                f"Supported types are: {', '.join(supported_types)}."
-            )
-            sys.exit(1)
-    except Exception:
-        traceback.print_exc()
-        logging.debug(traceback.format_exc())
-        sys.exit(1)
+    if "func" in args:
+        setup_logging(args.log_filename)
+        args.func(args)
+    else:
+        parser.print_help()
 
 
 if __name__ == "__main__":