From 4c5b5f67e366f97f8af31e3e0220ac27ce0348ed Mon Sep 17 00:00:00 2001 From: Premek Vysoky Date: Tue, 23 Nov 2021 15:50:35 +0100 Subject: [PATCH 1/4] Include exit code in XHarness retry/reboot telemetry --- .../xharness-event-processor.py | 55 ++++++++++--------- .../xharness-runner/xharness-runner.apple.sh | 1 + 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py index 8b7583c9676..9efac036288 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py @@ -34,6 +34,10 @@ output_directory = os.getenv('HELIX_WORKITEM_UPLOAD_ROOT') retry = False reboot = False +retry_dimensions = None +reboot_dimensions = None +retry_exit_code = -1 +reboot_exit_code = -1 def remove_android_apps(device: str = None): """ Removes all Android applications from the target device/emulator @@ -67,36 +71,36 @@ def remove_android_apps(device: str = None): except Exception as e: print(f' Failed to remove app: {e}') -def analyze_operation(command: str, platform: str, device: str, isDevice: bool, target: str, exitCode: int): +def analyze_operation(command: str, platform: str, device: str, is_device: bool, target: str, exit_code: int): """ Analyzes the result and requests retry/reboot in case of an infra failure Too see where the exit code values come from, see https://github.com/dotnet/xharness/blob/master/src/Microsoft.DotNet.XHarness.Common/CLI/ExitCode.cs """ - print(f'Analyzing {platform}/{command}@{target} ({exitCode})') + print(f'Analyzing {platform}/{command}@{target} ({exit_code})') global retry, reboot if platform == "android": - if exitCode == 85: # ADB_DEVICE_ENUMERATION_FAILURE + if exit_code == 85: # ADB_DEVICE_ENUMERATION_FAILURE # This handles issues where devices or emulators fail to start. # The only solution is to reboot the machine, so we request a work item retry + agent reboot when this happens print(' Encountered ADB_DEVICE_ENUMERATION_FAILURE. This is typically not a failure of the work item. It will be run again. This machine will reboot to help its devices') print(' If this occurs repeatedly, please check for architectural mismatch, e.g. sending arm64_v8a APKs to an x86_64 / x86 only queue.') - if not isDevice and os.name != 'nt': + if not is_device and os.name != 'nt': # Copy emulator log subprocess.call(['cp', '/tmp/*-logcat.log', output_directory]) reboot = True retry = True - if exitCode == 78: # PACKAGE_INSTALLATION_FAILURE + if exit_code == 78: # PACKAGE_INSTALLATION_FAILURE # This handles issues where APKs fail to install. # We already reboot a device inside XHarness and now request a work item retry when this happens print(' Encountered PACKAGE_INSTALLATION_FAILURE. This is typically not a failure of the work item. We will try it again on another Helix agent') print(' If this occurs repeatedly, please check for architectural mismatch, e.g. requesting installation on arm64_v8a-only queue for x86 or x86_64 APKs.') - if isDevice: + if is_device: try: remove_android_apps(device) except Exception as e: @@ -108,46 +112,46 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool, retry_message = 'This is typically not a failure of the work item. It will be run again. ' reboot_message = 'This machine will reboot to heal.' - if isDevice: + if is_device: # If we fail to find a real device, it is unexpected as device queues should have one # It can often be fixed with a reboot - if exitCode == 81: # DEVICE_NOT_FOUND + if exit_code == 81: # DEVICE_NOT_FOUND print(f' Requested tethered Apple device not found. {retry_message}{reboot_message}') reboot = True retry = True # Devices can be locked or in a corrupted state, in this case we only retry the work item - if exitCode == 89: # DEVICE_FAILURE + if exit_code == 89: # DEVICE_FAILURE print(f' Failed to launch the simulator. {retry_message}') retry = True else: # Kill the simulator when we fail to launch the app - if exitCode == 80: # APP_CRASH + if exit_code == 80: # APP_CRASH simulator_app = os.getenv('SIMULATOR_APP') subprocess.call(['sudo', 'pkill', '-9', '-f', simulator_app]) # If we have a launch failure on simulators, we want a reboot+retry - if exitCode == 83: # APP_LAUNCH_FAILURE + if exit_code == 83: # APP_LAUNCH_FAILURE print(f' Encountered APP_LAUNCH_FAILURE. {retry_message}{reboot_message}') reboot = True retry = True # If we fail to find a simulator and we are not targeting a specific version (e.g. `ios-simulator_13.5`), # it is probably an issue because Xcode should always have at least one runtime version inside - if exitCode == 81 and '_' not in target: # DEVICE_NOT_FOUND + if exit_code == 81 and '_' not in target: # DEVICE_NOT_FOUND print(f' No simulator runtime found. {retry_message}') retry = True # Simulators are known to slow down which results in installation taking several minutes # Retry+reboot usually resolves this - if exitCode == 86: # APP_INSTALLATION_TIMEOUT + if exit_code == 86: # APP_INSTALLATION_TIMEOUT print(f' Installation timed out. {retry_message}{reboot_message}') reboot = True retry = True # Simulators are known to slow/break down and a reboot usually helps # This manifest by us not being able to launch the simulator - if exitCode == 88: # SIMULATOR_FAILURE + if exit_code == 88: # SIMULATOR_FAILURE print(f' Failed to launch the simulator. {retry_message}{reboot_message}') reboot = True retry = True @@ -157,45 +161,44 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool, print(f"Reporting {len(operations)} events from diagnostics file `{diagnostics_file}`") -retry_dimensions = None -reboot_dimensions = None - # Parse operations, analyze them and send them to Application Insights for operation in operations: command = operation['command'] platform = operation['platform'] - exitCode = operation['exitCode'] + exit_code = operation['exitCode'] duration = operation['duration'] device = operation.get('device') target = operation.get('target') - targetOS = operation.get('targetOS') - isDevice = operation.get('isDevice', False) + target_os = operation.get('targetOS') + is_device = operation.get('isDevice', False) try: - analyze_operation(command, platform, device, isDevice, target, exitCode) + analyze_operation(command, platform, device, is_device, target, exit_code) except Exception as e: print(f' Failed to analyze operation: {e}') custom_dimensions = dict() custom_dimensions['command'] = operation['command'] custom_dimensions['platform'] = operation['platform'] + custom_dimensions['isDevice'] = 'true' if operation['isDevice'] else 'false' if 'target' in operation: if 'targetOS' in operation: - custom_dimensions['target'] = target + '_' + targetOS + custom_dimensions['target'] = target + '_' + target_os else: custom_dimensions['target'] = target elif 'targetOS' in operation: - custom_dimensions['target'] = targetOS + custom_dimensions['target'] = target_os # Note down the dimensions that caused retry/reboot if retry and retry_dimensions is None: retry_dimensions = custom_dimensions + retry_exit_code = exit_code if reboot and reboot_dimensions is None: reboot_dimensions = custom_dimensions - app_insights.send_metric('XHarnessOperation', exitCode, properties=custom_dimensions) + app_insights.send_metric('XHarnessOperation', exit_code, properties=custom_dimensions) app_insights.send_metric('XHarnessOperationDuration', duration, properties=custom_dimensions) # Retry / reboot is handled here @@ -208,9 +211,9 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool, reboot = True if retry: - app_insights.send_metric('XHarnessRetry', 1, properties=retry_dimensions) + app_insights.send_metric('XHarnessRetry', retry_exit_code, properties=retry_dimensions) request_infra_retry('Requesting work item retry because an infrastructure issue was detected on this machine') if reboot: - app_insights.send_metric('XHarnessReboot', 1, properties=reboot_dimensions) + app_insights.send_metric('XHarnessReboot', reboot_exit_code, properties=reboot_dimensions) request_reboot('Requesting machine reboot as an infrastructure issue was detected on this machine') diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh index e08412c287d..2c1bfef1d94 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh @@ -175,6 +175,7 @@ sudo chown -R helix-runner "$output_directory" chmod -R 0766 "$output_directory" # Remove empty files +echo "Removing empty log files:" find "$output_directory" -name "*.log" -maxdepth 1 -size 0 -print -delete # Rename test result XML so that AzDO reporter recognizes it From 34612ce90ac97fbb1900a0b5c4107dce7d41e152 Mon Sep 17 00:00:00 2001 From: Premek Vysoky Date: Tue, 23 Nov 2021 16:19:08 +0100 Subject: [PATCH 2/4] Extract constants and collect isDevice too --- .../xharness-runner/xharness-event-processor.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py index 9efac036288..12943eae73e 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py @@ -11,6 +11,12 @@ ### The diagnostics.json file contains information about each XHarness command executed during the job ### In case of events that suggest infrastructure issues, we request a retry and for some reboot the agent +# Name of metrics we send to Helix +OPERATION_METRIC_NAME = 'XHarnessOperation' +DURATION_METRIC_NAME = 'XHarnessOperationDuration' +RETRY_METRIC_NAME = 'XHarnessRetry' +REBOOT_METRIC_NAME = 'XHarnessReboot' + opts, args = getopt.gnu_getopt(sys.argv[1:], 'd:', ['diagnostics-data=']) opt_dict = dict(opts) @@ -32,6 +38,9 @@ exit(2) output_directory = os.getenv('HELIX_WORKITEM_UPLOAD_ROOT') + +# For the first operation that causes a retry/reboot, we send a metric to Helix +# Retry/reboot can be also asked for by the client (by creating .retry/.reboot files) retry = False reboot = False retry_dimensions = None @@ -198,8 +207,8 @@ def analyze_operation(command: str, platform: str, device: str, is_device: bool, if reboot and reboot_dimensions is None: reboot_dimensions = custom_dimensions - app_insights.send_metric('XHarnessOperation', exit_code, properties=custom_dimensions) - app_insights.send_metric('XHarnessOperationDuration', duration, properties=custom_dimensions) + app_insights.send_metric(OPERATION_METRIC_NAME, exit_code, properties=custom_dimensions) + app_insights.send_metric(DURATION_METRIC_NAME, duration, properties=custom_dimensions) # Retry / reboot is handled here script_dir = os.getenv('HELIX_WORKITEM_ROOT') @@ -211,9 +220,9 @@ def analyze_operation(command: str, platform: str, device: str, is_device: bool, reboot = True if retry: - app_insights.send_metric('XHarnessRetry', retry_exit_code, properties=retry_dimensions) + app_insights.send_metric(RETRY_METRIC_NAME, retry_exit_code, properties=retry_dimensions) request_infra_retry('Requesting work item retry because an infrastructure issue was detected on this machine') if reboot: - app_insights.send_metric('XHarnessReboot', reboot_exit_code, properties=reboot_dimensions) + app_insights.send_metric(REBOOT_METRIC_NAME, reboot_exit_code, properties=reboot_dimensions) request_reboot('Requesting machine reboot as an infrastructure issue was detected on this machine') From 5bc29050e68f1994793d843a4fdc12e65557cae7 Mon Sep 17 00:00:00 2001 From: Premek Vysoky Date: Tue, 23 Nov 2021 16:24:34 +0100 Subject: [PATCH 3/4] Remove a comment --- .../Sdk/tools/xharness-runner/xharness-runner.apple.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh index 2c1bfef1d94..a8647b57d38 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh @@ -174,7 +174,6 @@ fi sudo chown -R helix-runner "$output_directory" chmod -R 0766 "$output_directory" -# Remove empty files echo "Removing empty log files:" find "$output_directory" -name "*.log" -maxdepth 1 -size 0 -print -delete From f999553258ed29858e0d8f5293f3c1a24a4d6f56 Mon Sep 17 00:00:00 2001 From: Premek Vysoky Date: Tue, 23 Nov 2021 16:27:46 +0100 Subject: [PATCH 4/4] Use helix.public API --- .../Sdk/tools/xharness-runner/xharness-event-processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py index 12943eae73e..98a44fce973 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-event-processor.py @@ -5,7 +5,7 @@ import sys from helix.appinsights import app_insights -from helix.workitemutil import request_reboot, request_infra_retry +from helix.public import request_reboot, request_infra_retry ### This script's purpose is to parse the diagnostics.json file produced by XHarness, evaluate it and send it to AppInsights ### The diagnostics.json file contains information about each XHarness command executed during the job