From f3e53478ece9fc78d97c1b20821a1dfc61d37789 Mon Sep 17 00:00:00 2001 From: Suket Sharma Date: Mon, 24 Jun 2024 13:27:32 -0700 Subject: [PATCH 1/4] Add nvidia-bug-report to eks-logs-collector --- log-collector-script/linux/eks-log-collector.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/log-collector-script/linux/eks-log-collector.sh b/log-collector-script/linux/eks-log-collector.sh index 7453b05a2..20b4f5040 100755 --- a/log-collector-script/linux/eks-log-collector.sh +++ b/log-collector-script/linux/eks-log-collector.sh @@ -63,6 +63,7 @@ COMMON_DIRECTORIES=( kubelet # eks nodeadm # eks cni # eks + gpu # eks ) COMMON_LOGS=( @@ -287,6 +288,7 @@ collect() { get_sandboxImage_info get_cpu_throttled_processes get_io_throttled_processes + get_nvidia_bug_report } pack() { @@ -796,6 +798,16 @@ get_io_throttled_processes() { ok } +get_nvidia_bug_report() { + try "Collect Nvidia Bug report" + if ! command -v nvidia-bug-report.sh &> /dev/null; then + echo "No Nvidia drivers found, nothing to do." + else + timeout 75 command nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null + fi + ok +} + # ----------------------------------------------------------------------------- # Entrypoint parse_options "$@" From f88c482a18ce2331bd007bf9909a91a2cc12f9ed Mon Sep 17 00:00:00 2001 From: Suket Sharma Date: Mon, 24 Jun 2024 13:33:13 -0700 Subject: [PATCH 2/4] Fixing linter error --- log-collector-script/linux/eks-log-collector.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/log-collector-script/linux/eks-log-collector.sh b/log-collector-script/linux/eks-log-collector.sh index 20b4f5040..1ab8ddb1e 100755 --- a/log-collector-script/linux/eks-log-collector.sh +++ b/log-collector-script/linux/eks-log-collector.sh @@ -800,7 +800,7 @@ get_io_throttled_processes() { get_nvidia_bug_report() { try "Collect Nvidia Bug report" - if ! command -v nvidia-bug-report.sh &> /dev/null; then + if ! command -v nvidia-bug-report.sh &> /dev/null; then echo "No Nvidia drivers found, nothing to do." else timeout 75 command nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null From cf280e92d195fe5dfe42fa201283e05537ed0ba6 Mon Sep 17 00:00:00 2001 From: Suket Sharma Date: Mon, 24 Jun 2024 13:39:56 -0700 Subject: [PATCH 3/4] Fixing linter again oof --- log-collector-script/linux/eks-log-collector.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/log-collector-script/linux/eks-log-collector.sh b/log-collector-script/linux/eks-log-collector.sh index 1ab8ddb1e..4063a1873 100755 --- a/log-collector-script/linux/eks-log-collector.sh +++ b/log-collector-script/linux/eks-log-collector.sh @@ -803,7 +803,7 @@ get_nvidia_bug_report() { if ! command -v nvidia-bug-report.sh &> /dev/null; then echo "No Nvidia drivers found, nothing to do." else - timeout 75 command nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null + timeout 75 command nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null fi ok } From 68ad46d547e6323cd0318b092d80f69bf073e739 Mon Sep 17 00:00:00 2001 From: Suket Sharma Date: Mon, 24 Jun 2024 14:09:21 -0700 Subject: [PATCH 4/4] Addressing comments --- log-collector-script/linux/eks-log-collector.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/log-collector-script/linux/eks-log-collector.sh b/log-collector-script/linux/eks-log-collector.sh index 4063a1873..4b74232f2 100755 --- a/log-collector-script/linux/eks-log-collector.sh +++ b/log-collector-script/linux/eks-log-collector.sh @@ -803,7 +803,7 @@ get_nvidia_bug_report() { if ! command -v nvidia-bug-report.sh &> /dev/null; then echo "No Nvidia drivers found, nothing to do." else - timeout 75 command nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null + timeout 75 nvidia-bug-report.sh --output-file "${COLLECT_DIR}"/gpu/nvidia-bug-report.log &> /dev/null fi ok }