Skip to content

Commit

Permalink
Holoscan Flow Benchmarking Logging Updates (#555)
Browse files Browse the repository at this point in the history
* Benchmark: Use `logging` module in Holoscan Flow Benchmarking script

Changes:
- Adopts standard `logging` module to enhance granular control and
  redirection of benchmark script logging
- Updates missing log files from warning to error severity

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>

* Benchmark: Fix path check for missing flow tracking log files

Refactors code and addresses issue where log file parent directory was
not appended to path search for existence, resulting in only the local
directory being checked for log files existence.

NVBug 4933187

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>

* Update CLI and address review comments

Changes:
- Add `--level` option to control logging verbosity and use `logger`
  object throughout
- Refactor logging so that each output filepath is printed exactly once.
  Available files are printed with INFO verbosity, missing files are
  printed with ERROR verbosity.
- Exit with error if log files are missing

Sample log printout:
```
2024-11-08 05:23:47,323 INFO: Log file directory: /workspace/holohub/log_directory_20241108-052346
2024-11-08 05:23:47,324 INFO: Log files are available:
2024-11-08 05:23:47,324 ERROR: Log files are missing: /workspace/holohub/log_directory_20241108-052346/logger_greedy_1_1.log
2024-11-08 05:23:47,324 ERROR: Some log files are missing. Please check the log directory.
```

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>

* Infra: Propagate `run launch` command errors

Updates `run` script to propagate errors from `run launch` as the `run`
script exit code.

Allows Holoscan Flow Benchmarking tools to detect when a HoloHub
application process exits abnormally.

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>

* Benchmark: Add file handler and process debug output

Changes:
- Adds logging handler to log detailed (DEBUG) output to a
  "benchmark.log" file in the given run log directory. Adds a stream
  handler to preserve control over CLI output verbosity with the
  "--level" CLI argument.
- Logs stdout and stderr from an failing process to aid error investigation.
  Includes thread ID.

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>

---------

Signed-off-by: Tom Birdsong <tbirdsong@nvidia.com>
  • Loading branch information
tbirdso authored Nov 12, 2024
1 parent 25b6b63 commit e248dd4
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 23 deletions.
81 changes: 59 additions & 22 deletions benchmarks/holoscan_flow_benchmarking/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,26 @@
# limitations under the License.

import argparse
import logging
import os
import subprocess
import sys
import threading
import time
from collections import defaultdict

from nvitop import Device

logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.DEBUG)

# global variables for GPU monitoring
stop_gpu_monitoring = False
stop_gpu_monitoring_lock = threading.Lock()


def monitor_gpu(gpu_uuids, filename):
print("Monitoring GPU utilization in a separate thread")
logger.info("Monitoring GPU utilization in a separate thread")
devices = []
if gpu_uuids == "all" or gpu_uuids == "" or gpu_uuids is None:
devices = Device.all()
Expand Down Expand Up @@ -67,13 +72,16 @@ def run_command(app_launch_command, env):
stderr=subprocess.PIPE,
)
except subprocess.CalledProcessError as e:
print(f"Error running command: {e}")
print(f"stdout:\n{e.stdout.decode('utf-8')}")
print(f"stderr:\n{e.stderr.decode('utf-8')}")
logger.error(f"Error running command: {e}")
logger.error(f"stdout:\n{e.stdout.decode('utf-8')}")
logger.error(f"stderr:\n{e.stderr.decode('utf-8')}")
sys.exit(1)

logger.debug(f"stdout:\n{result.stdout}")
logger.debug(f"stderr:\n{result.stderr}")
if result.returncode != 0:
# The command returned an error
print(f"Error: Command {app_launch_command} exited with code {result.returncode}")
logger.error(f'Command "{app_launch_command}" exited with code {result.returncode}')
sys.exit(1)


Expand Down Expand Up @@ -178,6 +186,7 @@ def main():
parser.add_argument(
"-u", "--monitor_gpu", action="store_true", help="enable this to monitor GPU utilization"
)
parser.add_argument("--level", type=str, default="INFO", help="Logging verbosity level")

args = parser.parse_args()

Expand All @@ -186,8 +195,8 @@ def main():
and "eventbased" not in args.sched
and args.num_worker_threads != 1
):
print(
"Warning: num_worker_threads is ignored as multithread or eventbased scheduler is not used"
logger.warning(
"num_worker_threads is ignored as multithread or eventbased scheduler is not used"
)

log_directory = None
Expand All @@ -200,12 +209,26 @@ def main():
# check if the given directory is valid or not
log_directory = args.log_directory
if not os.path.isdir(log_directory):
print(
logger.info(
"Log directory is not found. Creating a new directory at",
os.path.abspath(log_directory),
)
os.mkdir(os.path.abspath(log_directory))

# Set up detailed logging to file + custom verbosity logging to console
fileHandler = logging.FileHandler(os.path.join(log_directory, "benchmark.log"))
fileHandler.setLevel(level=logging.DEBUG)
fileHandler.setFormatter(
logging.Formatter("%(asctime)s %(threadName)s %(levelname)s: %(message)s")
)
logger.addHandler(fileHandler)

consoleHandler = logging.StreamHandler(stream=sys.stdout)
consoleHandler.setLevel(level=args.level.upper())
consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s: %(message)s"))
logger.root.removeHandler(logger.root.handlers[0])
logger.addHandler(consoleHandler)

env = os.environ.copy()
if args.gpu != "all":
env["CUDA_VISIBLE_DEVICES"] = args.gpu
Expand All @@ -228,12 +251,11 @@ def main():
env["HOLOSCAN_SCHEDULER"] = scheduler
env["HOLOSCAN_EVENTBASED_WORKER_THREADS"] = str(args.num_worker_threads)
elif scheduler != "greedy":
print("Unsupported scheduler ", scheduler)
logger.error("Unsupported scheduler ", scheduler)
sys.exit(1)
# No need to set the scheduler for greedy scheduler

for i in range(1, args.runs + 1):
print(f"Run {i} started for {scheduler} scheduler.")
logger.info(f"Run {i} started for {scheduler} scheduler.")
instance_threads = []
if args.monitor_gpu:
gpu_utilization_logfile_name = (
Expand Down Expand Up @@ -272,19 +294,34 @@ def main():
stop_gpu_monitoring_lock.release()
gpu_monitoring_thread.join()
gpu_utilization_log_files.append(gpu_utilization_logfile_name)
print(f"Run {i} completed for {scheduler} scheduler.")
logger.info(f"Run {i} completed for {scheduler} scheduler.")
time.sleep(1) # cool down period
logger.info("****************************************************************")
logger.info("Evaluation completed.")
logger.info("****************************************************************")

logger.info(f"Log file directory: {os.path.abspath(log_directory)}")
log_info = defaultdict(lambda: defaultdict(list))
log_file_sets = [("log", log_files)] + (
[("gpu", gpu_utilization_log_files)] if args.monitor_gpu else []
)
for log_type, log_list in log_file_sets:
abs_filepaths = [
os.path.abspath(os.path.join(log_directory, log_file)) for log_file in log_list
]
log_info[log_type]["found"] = [file for file in abs_filepaths if os.path.exists(file)]
log_info[log_type]["missing"] = [file for file in abs_filepaths if not os.path.exists(file)]
logger.info(
f'{log_type.capitalize()} files are available: {", ".join(log_info[log_type]["found"])}'
)
if log_info[log_type]["missing"]:
logger.error(
f'{log_type.capitalize()} files are missing: {", ".join(log_info[log_type]["missing"])}'
)

# Just print comma-separate values of log_files
print("\nEvaluation completed.")
print("Log file directory: ", os.path.abspath(log_directory))
print("All the data flow tracking log files are:", ", ".join(log_files))
for missing_file in (file for file in log_files if not os.path.exists(file)):
print(f"Warning: Log file {missing_file} is missing.")
if args.monitor_gpu:
print("All the GPU utilization log files are:", ", ".join(gpu_utilization_log_files))
for missing_file in (file for file in gpu_utilization_log_files):
print(f"Warning: Utilization log file {missing_file} is missing.")
if any(log_info[log_type]["missing"] for log_type in ("log", "gpu")):
logger.error("Some log files are missing. Please check the log directory.")
sys.exit(1)


if __name__ == "__main__":
Expand Down
4 changes: 3 additions & 1 deletion run
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,10 @@ for k, v in obj[project_type]["run"].items():
run_command $environment
run_command $workdir
run_command $command $extra_args
command_status="$?"
run_command $reset_environment

return ${command_status}
}

#===========================================================================================
Expand Down Expand Up @@ -1030,7 +1033,6 @@ main() {
fi
;;
esac
ret=$?
}

main "$@"

0 comments on commit e248dd4

Please sign in to comment.