Skip to content

Commit

Permalink
Collect device flag and add exit code to retry/reboot events (#8213)
Browse files Browse the repository at this point in the history
  • Loading branch information
premun authored Nov 23, 2021
1 parent 977c461 commit e52f750
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
import sys

from helix.appinsights import app_insights
from helix.workitemutil import request_reboot, request_infra_retry
from helix.public import request_reboot, request_infra_retry

### This script's purpose is to parse the diagnostics.json file produced by XHarness, evaluate it and send it to AppInsights
### The diagnostics.json file contains information about each XHarness command executed during the job
### In case of events that suggest infrastructure issues, we request a retry and, for some of them, also a reboot of the agent

# Name of metrics we send to Helix
OPERATION_METRIC_NAME = 'XHarnessOperation'
DURATION_METRIC_NAME = 'XHarnessOperationDuration'
RETRY_METRIC_NAME = 'XHarnessRetry'
REBOOT_METRIC_NAME = 'XHarnessReboot'

opts, args = getopt.gnu_getopt(sys.argv[1:], 'd:', ['diagnostics-data='])
opt_dict = dict(opts)

Expand All @@ -32,8 +38,15 @@
exit(2)

output_directory = os.getenv('HELIX_WORKITEM_UPLOAD_ROOT')

# For the first operation that causes a retry/reboot, we send a metric to Helix
# Retry/reboot can also be requested by the client (by creating .retry/.reboot files)
retry = False
reboot = False
retry_dimensions = None
reboot_dimensions = None
retry_exit_code = -1
reboot_exit_code = -1

def remove_android_apps(device: str = None):
""" Removes all Android applications from the target device/emulator
Expand Down Expand Up @@ -67,36 +80,36 @@ def remove_android_apps(device: str = None):
except Exception as e:
print(f' Failed to remove app: {e}')

def analyze_operation(command: str, platform: str, device: str, isDevice: bool, target: str, exitCode: int):
def analyze_operation(command: str, platform: str, device: str, is_device: bool, target: str, exit_code: int):
""" Analyzes the result and requests retry/reboot in case of an infra failure
To see where the exit code values come from, see https://github.com/dotnet/xharness/blob/master/src/Microsoft.DotNet.XHarness.Common/CLI/ExitCode.cs
"""

print(f'Analyzing {platform}/{command}@{target} ({exitCode})')
print(f'Analyzing {platform}/{command}@{target} ({exit_code})')

global retry, reboot

if platform == "android":
if exitCode == 85: # ADB_DEVICE_ENUMERATION_FAILURE
if exit_code == 85: # ADB_DEVICE_ENUMERATION_FAILURE
# This handles issues where devices or emulators fail to start.
# The only solution is to reboot the machine, so we request a work item retry + agent reboot when this happens
print(' Encountered ADB_DEVICE_ENUMERATION_FAILURE. This is typically not a failure of the work item. It will be run again. This machine will reboot to help its devices')
print(' If this occurs repeatedly, please check for architectural mismatch, e.g. sending arm64_v8a APKs to an x86_64 / x86 only queue.')

if not isDevice and os.name != 'nt':
if not is_device and os.name != 'nt':
# Copy emulator log
subprocess.call(['cp', '/tmp/*-logcat.log', output_directory])

reboot = True
retry = True

if exitCode == 78: # PACKAGE_INSTALLATION_FAILURE
if exit_code == 78: # PACKAGE_INSTALLATION_FAILURE
# This handles issues where APKs fail to install.
# We already reboot a device inside XHarness and now request a work item retry when this happens
print(' Encountered PACKAGE_INSTALLATION_FAILURE. This is typically not a failure of the work item. We will try it again on another Helix agent')
print(' If this occurs repeatedly, please check for architectural mismatch, e.g. requesting installation on arm64_v8a-only queue for x86 or x86_64 APKs.')

if isDevice:
if is_device:
try:
remove_android_apps(device)
except Exception as e:
Expand All @@ -108,46 +121,46 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool,
retry_message = 'This is typically not a failure of the work item. It will be run again. '
reboot_message = 'This machine will reboot to heal.'

if isDevice:
if is_device:
# If we fail to find a real device, it is unexpected as device queues should have one
# It can often be fixed with a reboot
if exitCode == 81: # DEVICE_NOT_FOUND
if exit_code == 81: # DEVICE_NOT_FOUND
print(f' Requested tethered Apple device not found. {retry_message}{reboot_message}')
reboot = True
retry = True

# Devices can be locked or in a corrupted state, in this case we only retry the work item
if exitCode == 89: # DEVICE_FAILURE
if exit_code == 89: # DEVICE_FAILURE
print(f' Failed to launch the simulator. {retry_message}')
retry = True
else:
# Kill the simulator when we fail to launch the app
if exitCode == 80: # APP_CRASH
if exit_code == 80: # APP_CRASH
simulator_app = os.getenv('SIMULATOR_APP')
subprocess.call(['sudo', 'pkill', '-9', '-f', simulator_app])

# If we have a launch failure on simulators, we want a reboot+retry
if exitCode == 83: # APP_LAUNCH_FAILURE
if exit_code == 83: # APP_LAUNCH_FAILURE
print(f' Encountered APP_LAUNCH_FAILURE. {retry_message}{reboot_message}')
reboot = True
retry = True

# If we fail to find a simulator and we are not targeting a specific version (e.g. `ios-simulator_13.5`),
# it is probably an issue because Xcode should always have at least one runtime version inside
if exitCode == 81 and '_' not in target: # DEVICE_NOT_FOUND
if exit_code == 81 and '_' not in target: # DEVICE_NOT_FOUND
print(f' No simulator runtime found. {retry_message}')
retry = True

# Simulators are known to slow down which results in installation taking several minutes
# Retry+reboot usually resolves this
if exitCode == 86: # APP_INSTALLATION_TIMEOUT
if exit_code == 86: # APP_INSTALLATION_TIMEOUT
print(f' Installation timed out. {retry_message}{reboot_message}')
reboot = True
retry = True

# Simulators are known to slow/break down and a reboot usually helps
# This manifests as us not being able to launch the simulator
if exitCode == 88: # SIMULATOR_FAILURE
if exit_code == 88: # SIMULATOR_FAILURE
print(f' Failed to launch the simulator. {retry_message}{reboot_message}')
reboot = True
retry = True
Expand All @@ -157,46 +170,45 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool,

print(f"Reporting {len(operations)} events from diagnostics file `{diagnostics_file}`")

retry_dimensions = None
reboot_dimensions = None

# Parse operations, analyze them and send them to Application Insights
for operation in operations:
command = operation['command']
platform = operation['platform']
exitCode = operation['exitCode']
exit_code = operation['exitCode']
duration = operation['duration']
device = operation.get('device')
target = operation.get('target')
targetOS = operation.get('targetOS')
isDevice = operation.get('isDevice', False)
target_os = operation.get('targetOS')
is_device = operation.get('isDevice', False)

try:
analyze_operation(command, platform, device, isDevice, target, exitCode)
analyze_operation(command, platform, device, is_device, target, exit_code)
except Exception as e:
print(f' Failed to analyze operation: {e}')

custom_dimensions = dict()
custom_dimensions['command'] = operation['command']
custom_dimensions['platform'] = operation['platform']
custom_dimensions['isDevice'] = 'true' if operation['isDevice'] else 'false'

if 'target' in operation:
if 'targetOS' in operation:
custom_dimensions['target'] = target + '_' + targetOS
custom_dimensions['target'] = target + '_' + target_os
else:
custom_dimensions['target'] = target
elif 'targetOS' in operation:
custom_dimensions['target'] = targetOS
custom_dimensions['target'] = target_os

# Note down the dimensions that caused retry/reboot
if retry and retry_dimensions is None:
retry_dimensions = custom_dimensions
retry_exit_code = exit_code

if reboot and reboot_dimensions is None:
reboot_dimensions = custom_dimensions

app_insights.send_metric('XHarnessOperation', exitCode, properties=custom_dimensions)
app_insights.send_metric('XHarnessOperationDuration', duration, properties=custom_dimensions)
app_insights.send_metric(OPERATION_METRIC_NAME, exit_code, properties=custom_dimensions)
app_insights.send_metric(DURATION_METRIC_NAME, duration, properties=custom_dimensions)

# Retry / reboot is handled here
script_dir = os.getenv('HELIX_WORKITEM_ROOT')
Expand All @@ -208,9 +220,9 @@ def analyze_operation(command: str, platform: str, device: str, isDevice: bool,
reboot = True

if retry:
app_insights.send_metric('XHarnessRetry', 1, properties=retry_dimensions)
app_insights.send_metric(RETRY_METRIC_NAME, retry_exit_code, properties=retry_dimensions)
request_infra_retry('Requesting work item retry because an infrastructure issue was detected on this machine')

if reboot:
app_insights.send_metric('XHarnessReboot', 1, properties=reboot_dimensions)
app_insights.send_metric(REBOOT_METRIC_NAME, reboot_exit_code, properties=reboot_dimensions)
request_reboot('Requesting machine reboot as an infrastructure issue was detected on this machine')
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ fi
sudo chown -R helix-runner "$output_directory"
chmod -R 0766 "$output_directory"

# Remove empty files
echo "Removing empty log files:"
find "$output_directory" -name "*.log" -maxdepth 1 -size 0 -print -delete

# Rename test result XML so that AzDO reporter recognizes it
Expand Down

0 comments on commit e52f750

Please sign in to comment.