Holoscan Flow Benchmarking Logging Updates (#555)

* Benchmark: Use `logging` module in Holoscan Flow Benchmarking script Changes: - Adopts standard `logging` module to enhance granular control and redirection of benchmark script logging - Updates missing log files from warning to error severity Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com> * Benchmark: Fix path check for missing flow tracking log files Refactors code and addresses issue where log file parent directory was not appended to path search for existence, resulting in only the local directory being checked for log files existence. NVBug 4933187 Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com> * Update CLI and address review comments Changes: - Add `--level` option to control logging verbosity and use `logger` object throughout - Refactor logging so that each output filepath is printed exactly once. Available files are printed with INFO verbosity, missing files are printed with ERROR verbosity. - Exit with error if log files are missing Sample log printout: ``` 2024-11-08 05:23:47,323 INFO: Log file directory: /workspace/holohub/log_directory_20241108-052346 2024-11-08 05:23:47,324 INFO: Log files are available: 2024-11-08 05:23:47,324 ERROR: Log files are missing: /workspace/holohub/log_directory_20241108-052346/logger_greedy_1_1.log 2024-11-08 05:23:47,324 ERROR: Some log files are missing. Please check the log directory. ``` Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com> * Infra: Propagate `run launch` command errors Updates `run` script to propagate errors from `run launch` as the `run` script exit code. Allows Holoscan Flow Benchmarking tools to detect when a HoloHub application process exits abnormally. Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com> * Benchmark: Add file handler and process debug output Changes: - Adds logging handler to log detailed (DEBUG) output to a "benchmark.log" file in the given run log directory. Adds a stream handler to preserve control over CLI output verbosity with the "--level" CLI argument. - Logs stdout and stderr from an failing process to aid error investigation. Includes thread ID. Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com> --------- Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>
nvidia-holoscan · Nov 12, 2024 · e248dd4 · e248dd4
1 parent 25b6b63
commit e248dd4
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 23 deletions.
diff --git a/benchmarks/holoscan_flow_benchmarking/benchmark.py b/benchmarks/holoscan_flow_benchmarking/benchmark.py
@@ -14,21 +14,26 @@
 # limitations under the License.
 
 import argparse
+import logging
 import os
 import subprocess
 import sys
 import threading
 import time
+from collections import defaultdict
 
 from nvitop import Device
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.DEBUG)
+
 # global variables for GPU monitoring
 stop_gpu_monitoring = False
 stop_gpu_monitoring_lock = threading.Lock()
 
 
 def monitor_gpu(gpu_uuids, filename):
-    print("Monitoring GPU utilization in a separate thread")
+    logger.info("Monitoring GPU utilization in a separate thread")
     devices = []
     if gpu_uuids == "all" or gpu_uuids == "" or gpu_uuids is None:
         devices = Device.all()
@@ -67,13 +72,16 @@ def run_command(app_launch_command, env):
             stderr=subprocess.PIPE,
         )
     except subprocess.CalledProcessError as e:
-        print(f"Error running command: {e}")
-        print(f"stdout:\n{e.stdout.decode('utf-8')}")
-        print(f"stderr:\n{e.stderr.decode('utf-8')}")
+        logger.error(f"Error running command: {e}")
+        logger.error(f"stdout:\n{e.stdout.decode('utf-8')}")
+        logger.error(f"stderr:\n{e.stderr.decode('utf-8')}")
         sys.exit(1)
+
+    logger.debug(f"stdout:\n{result.stdout}")
+    logger.debug(f"stderr:\n{result.stderr}")
     if result.returncode != 0:
         # The command returned an error
-        print(f"Error: Command {app_launch_command} exited with code {result.returncode}")
+        logger.error(f'Command "{app_launch_command}" exited with code {result.returncode}')
         sys.exit(1)
 
 
@@ -178,6 +186,7 @@ def main():
     parser.add_argument(
         "-u", "--monitor_gpu", action="store_true", help="enable this to monitor GPU utilization"
     )
+    parser.add_argument("--level", type=str, default="INFO", help="Logging verbosity level")
 
     args = parser.parse_args()
 
@@ -186,8 +195,8 @@ def main():
         and "eventbased" not in args.sched
         and args.num_worker_threads != 1
     ):
-        print(
-            "Warning: num_worker_threads is ignored as multithread or eventbased scheduler is not used"
+        logger.warning(
+            "num_worker_threads is ignored as multithread or eventbased scheduler is not used"
         )
 
     log_directory = None
@@ -200,12 +209,26 @@ def main():
         # check if the given directory is valid or not
         log_directory = args.log_directory
         if not os.path.isdir(log_directory):
-            print(
+            logger.info(
                 "Log directory is not found. Creating a new directory at",
                 os.path.abspath(log_directory),
             )
             os.mkdir(os.path.abspath(log_directory))
 
+    # Set up detailed logging to file + custom verbosity logging to console
+    fileHandler = logging.FileHandler(os.path.join(log_directory, "benchmark.log"))
+    fileHandler.setLevel(level=logging.DEBUG)
+    fileHandler.setFormatter(
+        logging.Formatter("%(asctime)s %(threadName)s %(levelname)s: %(message)s")
+    )
+    logger.addHandler(fileHandler)
+
+    consoleHandler = logging.StreamHandler(stream=sys.stdout)
+    consoleHandler.setLevel(level=args.level.upper())
+    consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
+    logger.root.removeHandler(logger.root.handlers[0])
+    logger.addHandler(consoleHandler)
+
     env = os.environ.copy()
     if args.gpu != "all":
         env["CUDA_VISIBLE_DEVICES"] = args.gpu
@@ -228,12 +251,11 @@ def main():
             env["HOLOSCAN_SCHEDULER"] = scheduler
             env["HOLOSCAN_EVENTBASED_WORKER_THREADS"] = str(args.num_worker_threads)
         elif scheduler != "greedy":
-            print("Unsupported scheduler ", scheduler)
+            logger.error("Unsupported scheduler ", scheduler)
             sys.exit(1)
         # No need to set the scheduler for greedy scheduler
-
         for i in range(1, args.runs + 1):
-            print(f"Run {i} started for {scheduler} scheduler.")
+            logger.info(f"Run {i} started for {scheduler} scheduler.")
             instance_threads = []
             if args.monitor_gpu:
                 gpu_utilization_logfile_name = (
@@ -272,19 +294,34 @@ def main():
                 stop_gpu_monitoring_lock.release()
                 gpu_monitoring_thread.join()
                 gpu_utilization_log_files.append(gpu_utilization_logfile_name)
-            print(f"Run {i} completed for {scheduler} scheduler.")
+            logger.info(f"Run {i} completed for {scheduler} scheduler.")
             time.sleep(1)  # cool down period
+    logger.info("****************************************************************")
+    logger.info("Evaluation completed.")
+    logger.info("****************************************************************")
+
+    logger.info(f"Log file directory: {os.path.abspath(log_directory)}")
+    log_info = defaultdict(lambda: defaultdict(list))
+    log_file_sets = [("log", log_files)] + (
+        [("gpu", gpu_utilization_log_files)] if args.monitor_gpu else []
+    )
+    for log_type, log_list in log_file_sets:
+        abs_filepaths = [
+            os.path.abspath(os.path.join(log_directory, log_file)) for log_file in log_list
+        ]
+        log_info[log_type]["found"] = [file for file in abs_filepaths if os.path.exists(file)]
+        log_info[log_type]["missing"] = [file for file in abs_filepaths if not os.path.exists(file)]
+        logger.info(
+            f'{log_type.capitalize()} files are available: {", ".join(log_info[log_type]["found"])}'
+        )
+        if log_info[log_type]["missing"]:
+            logger.error(
+                f'{log_type.capitalize()} files are missing: {", ".join(log_info[log_type]["missing"])}'
+            )
 
-    # Just print comma-separate values of log_files
-    print("\nEvaluation completed.")
-    print("Log file directory: ", os.path.abspath(log_directory))
-    print("All the data flow tracking log files are:", ", ".join(log_files))
-    for missing_file in (file for file in log_files if not os.path.exists(file)):
-        print(f"Warning: Log file {missing_file} is missing.")
-    if args.monitor_gpu:
-        print("All the GPU utilization log files are:", ", ".join(gpu_utilization_log_files))
-        for missing_file in (file for file in gpu_utilization_log_files):
-            print(f"Warning: Utilization log file {missing_file} is missing.")
+    if any(log_info[log_type]["missing"] for log_type in ("log", "gpu")):
+        logger.error("Some log files are missing. Please check the log directory.")
+        sys.exit(1)
 
 
 if __name__ == "__main__":

diff --git a/run b/run
@@ -866,7 +866,10 @@ for k, v in obj[project_type]["run"].items():
    run_command $environment
    run_command $workdir
    run_command $command $extra_args
+   command_status="$?"
    run_command $reset_environment
+
+   return ${command_status}
 }
 
 #===========================================================================================
@@ -1030,7 +1033,6 @@ main() {
             fi
             ;;
     esac
-    ret=$?
 }
 
 main "$@"