Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use XHarness' diagnostics output to detect infrastructure issues #8139

Merged
merged 11 commits into from
Nov 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
</ItemGroup>

<ItemGroup>
<EmbeddedResource Include="tools\xharness-runner\xharness-event-reporter.py">
<EmbeddedResource Include="tools\xharness-runner\xharness-event-processor.py">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="tools\xharness-runner\xharness-helix-job.apple.sh">
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.DotNet.Helix/Sdk/XharnessTaskBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public class MetadataName

protected const string ScriptNamespace = "tools.xharness_runner.";
private const string CustomCommandsScript = "command";
private const string DiagnosticsScript = "xharness-event-reporter.py";
private const string DiagnosticsScript = "xharness-event-processor.py";

/// <summary>
/// Extra arguments that will be passed to the iOS/Android/... app that is being run
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
<HelixPreCommands>$(HelixPreCommands);export XHARNESS_CLI_PATH=$HELIX_CORRELATION_PAYLOAD/microsoft.dotnet.xharness.cli/$(_XHarnessPackageVersion)/tools/net6.0/any/Microsoft.DotNet.XHarness.CLI.dll</HelixPreCommands>

<HelixPreCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">$(HelixPreCommands);export XHARNESS_DIAGNOSTICS_PATH=$HELIX_WORKITEM_ROOT/diagnostics.json</HelixPreCommands>
<HelixPostCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">"$HELIX_PYTHONPATH" "$HELIX_WORKITEM_PAYLOAD/xharness-event-reporter.py";$(HelixPostCommands)</HelixPostCommands>
<HelixPostCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">"$HELIX_PYTHONPATH" "$HELIX_WORKITEM_PAYLOAD/xharness-event-processor.py";$(HelixPostCommands)</HelixPostCommands>
</PropertyGroup>

<PropertyGroup Condition="!$(IsPosixShell)">
Expand All @@ -85,7 +85,7 @@
<HelixPreCommands>$(HelixPreCommands);doskey xharness="dotnet exec %XHARNESS_CLI_PATH%"</HelixPreCommands>

<HelixPreCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">$(HelixPreCommands);set XHARNESS_DIAGNOSTICS_PATH=%HELIX_WORKITEM_ROOT%\diagnostics.json</HelixPreCommands>
<HelixPostCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">"%HELIX_PYTHONPATH%" "%HELIX_WORKITEM_PAYLOAD%\xharness-event-reporter.py";$(HelixPostCommands)</HelixPostCommands>
<HelixPostCommands Condition=" '$(EnableXHarnessTelemetry)' == 'true' ">"%HELIX_PYTHONPATH%" "%HELIX_WORKITEM_PAYLOAD%\xharness-event-processor.py";$(HelixPostCommands)</HelixPostCommands>
</PropertyGroup>
</Target>

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import getopt
import json
import os
import subprocess
import sys

from helix.appinsights import app_insights
from helix.workitemutil import request_reboot, request_infra_retry

### This script's purpose is to parse the diagnostics.json file produced by XHarness, evaluate it and send it to AppInsights
### The diagnostics.json file contains information about each XHarness command executed during the job
### In case of events that suggest infrastructure issues, we request a retry and for some reboot the agent

opts, args = getopt.gnu_getopt(sys.argv[1:], 'd:', ['diagnostics-data='])
opt_dict = dict(opts)

diagnostics_file = None

if '--data' in opt_dict:
diagnostics_file = opt_dict['--data']
elif '-d' in opt_dict:
diagnostics_file = opt_dict['-d']
else:
diagnostics_file = os.getenv('XHARNESS_DIAGNOSTICS_PATH')

if not diagnostics_file:
print('ERROR: Expected path to the diagnostics JSON file generated by XHarness')
exit(1)

if not os.path.isfile(diagnostics_file):
print(f"WARNING: Diagnostics file not found at `{diagnostics_file}`")
exit(2)

output_directory = os.getenv('HELIX_WORKITEM_UPLOAD_ROOT')
retry = False
reboot = False

def remove_android_apps(device: str = None):
""" Removes all Android applications from the target device/emulator
"""

print(f' Removing installed apps after unsuccessful run' + (' from ' + device if device else ""))

xharness_cli_path = os.getenv('XHARNESS_CLI_PATH')
adb_path = subprocess.check_output(['dotnet', 'exec', xharness_cli_path, 'android', 'state', '--adb']).decode('utf-8').strip()

# Get list of installed apps
if device:
installed_apps = subprocess.check_output([adb_path, '-s', device, 'shell', 'pm', 'list', 'packages', 'net.dot']).decode('utf-8').splitlines()
else:
installed_apps = subprocess.check_output([adb_path, 'shell', 'pm', 'list', 'packages', 'net.dot']).decode('utf-8').splitlines()

installed_apps = [app.split(':')[1] for app in installed_apps if app]

# Remove all installed apps
for app in installed_apps:
print(f' Removing {app}')

try:
if device:
result = subprocess.run([adb_path, '-s', device, 'uninstall', app], stdout=subprocess.PIPE)
else:
result = subprocess.run([adb_path, 'uninstall', app], stderr=subprocess.STDOUT)

output = result.stdout.decode('utf8')
print(f' {output}')
except Exception as e:
print(f' Failed to remove app: {e}')

def analyze_operation(command: str, platform: str, device: str, isDevice: bool, target: str, exitCode: int):
""" Analyzes the result and requests retry/reboot in case of an infra failure
Too see where the exit code values come from, see https://github.com/dotnet/xharness/blob/master/src/Microsoft.DotNet.XHarness.Common/CLI/ExitCode.cs
"""

print(f'Analyzing {platform}/{command}@{target} ({exitCode})')

global retry, reboot

if platform == "android":
if exitCode == 85: # ADB_DEVICE_ENUMERATION_FAILURE
# This handles issues where devices or emulators fail to start.
# The only solution is to reboot the machine, so we request a work item retry + agent reboot when this happens
print(' Encountered ADB_DEVICE_ENUMERATION_FAILURE. This is typically not a failure of the work item. It will be run again. This machine will reboot to help its devices')
print(' If this occurs repeatedly, please check for architectural mismatch, e.g. sending arm64_v8a APKs to an x86_64 / x86 only queue.')

if not isDevice and os.name != 'nt':
# Copy emulator log
subprocess.call(['cp', '/tmp/*-logcat.log', output_directory])

reboot = True
retry = True

if exitCode == 78: # PACKAGE_INSTALLATION_FAILURE
# This handles issues where APKs fail to install.
# We already reboot a device inside XHarness and now request a work item retry when this happens
print(' Encountered PACKAGE_INSTALLATION_FAILURE. This is typically not a failure of the work item. We will try it again on another Helix agent')
print(' If this occurs repeatedly, please check for architectural mismatch, e.g. requesting installation on arm64_v8a-only queue for x86 or x86_64 APKs.')

if isDevice:
try:
remove_android_apps(device)
except Exception as e:
print(f' Failed to remove installed apps from device: {e}')

retry = True

elif platform == "apple":
retry_message = 'This is typically not a failure of the work item. It will be run again. '
reboot_message = 'This machine will reboot to heal.'

if isDevice:
# If we fail to find a real device, it is unexpected as device queues should have one
# It can often be fixed with a reboot
if exitCode == 81: # DEVICE_NOT_FOUND
print(f' Requested tethered Apple device not found. {retry_message}{reboot_message}')
reboot = True
retry = True

# Devices can be locked or in a corrupted state, in this case we only retry the work item
if exitCode == 89: # DEVICE_FAILURE
print(f' Failed to launch the simulator. {retry_message}')
retry = True
else:
# Kill the simulator when we fail to launch the app
if exitCode == 80: # APP_CRASH
simulator_app = os.getenv('SIMULATOR_APP')
subprocess.call(['sudo', 'pkill', '-9', '-f', simulator_app])

# If we have a launch failure on simulators, we want a reboot+retry
if exitCode == 83: # APP_LAUNCH_FAILURE
print(f' Encountered APP_LAUNCH_FAILURE. {retry_message}{reboot_message}')
reboot = True
retry = True

# If we fail to find a simulator and we are not targeting a specific version (e.g. `ios-simulator_13.5`),
# it is probably an issue because Xcode should always have at least one runtime version inside
if exitCode == 81 and '_' not in target: # DEVICE_NOT_FOUND
print(f' No simulator runtime found. {retry_message}')
retry = True

# Simulators are known to slow down which results in installation taking several minutes
# Retry+reboot usually resolves this
if exitCode == 86: # APP_INSTALLATION_TIMEOUT
print(f' Installation timed out. {retry_message}{reboot_message}')
reboot = True
retry = True

# Simulators are known to slow/break down and a reboot usually helps
# This manifest by us not being able to launch the simulator
if exitCode == 88: # SIMULATOR_FAILURE
print(f' Failed to launch the simulator. {retry_message}{reboot_message}')
reboot = True
retry = True

# The JSON should be an array of objects (one per each executed XHarness command)
operations = json.load(open(diagnostics_file))

print(f"Reporting {len(operations)} events from diagnostics file `{diagnostics_file}`")

# Parse operations, analyze them and send them to Application Insights

for operation in operations:
command = operation['command']
platform = operation['platform']
exitCode = operation['exitCode']
duration = operation['duration']
device = operation.get('device')
target = operation.get('target')
targetOS = operation.get('targetOS')
isDevice = operation.get('isDevice', False)

try:
analyze_operation(command, platform, device, isDevice, target, exitCode)
except Exception as e:
print(f' Failed to analyze operation: {e}')

custom_dimensions = dict()
custom_dimensions['command'] = operation['command']
custom_dimensions['platform'] = operation['platform']

if 'target' in operation:
if 'targetOS' in operation:
custom_dimensions['target'] = target + '_' + targetOS
else:
custom_dimensions['target'] = target
elif 'targetOS' in operation:
custom_dimensions['target'] = targetOS

app_insights.send_metric('XHarnessOperation', exitCode, properties=custom_dimensions)
app_insights.send_metric('XHarnessOperationDuration', duration, properties=custom_dimensions)

# Retry / reboot is handled here
script_dir = os.getenv('HELIX_WORKITEM_ROOT')

if os.path.exists(os.path.join(script_dir, '.retry')):
retry = True

if os.path.exists(os.path.join(script_dir, '.reboot')):
reboot = True

if retry:
request_infra_retry('Requesting work item retry because an infrastructure issue was detected on this machine')

if reboot:
request_reboot('Requesting machine reboot as an infrastructure issue was detected on this machine')

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
<#
This script is used as a payload of Helix jobs that execute Android workloads through XHarness on Windows systems.
This is used as the entrypoint of the work item so that XHarness failures can be detected and (when appropriate)
cause the work item to retry and reboot the Helix agent the work is running on.

Currently no special functionality is needed beyond causing infrastructure retry and reboot if the emulators
or devices have trouble, but to add more Helix-specific Android XHarness behaviors, this is one extensibility point.
The purpose of this script is to time-constrain user commands (command.ps1) so that we have time at the end of the
work item to process XHarness telemetry.
#>

param (
Expand Down Expand Up @@ -41,78 +39,24 @@ $process.Start()

Wait-Process -InputObject $process -TimeOut $command_timeout -ErrorVariable ev -ErrorAction SilentlyContinue

function Remove-Apps {
Write-Output "Removing installed apps after unsuccessful run"
$adb_path = dotnet xharness android state --adb
$packages = & $adb_path shell pm list packages net.dot
$split_packages = $packages.split(':')
For ($i = 1; $i -lt $split_packages.Length; $i += 2) {
echo $split_packages[$i] | %{&$adb_path uninstall $_}
}
}

if ($ev) {
$exit_code = -3
Stop-Process -InputObject $process -Force
$process.WaitForExit()
[Console]::Out.Flush()
Write-Output "User command timed out after $command_timeout seconds!"
Remove-Apps
} else {
$exit_code = $process.ExitCode
Write-Output "User command ended with $exit_code"
}

$retry = $false
$reboot = $false

switch ($exit_code)
{
# ADB_DEVICE_ENUMERATION_FAILURE
85 {
Write-Error "Encountered ADB_DEVICE_ENUMERATION_FAILURE. This is typically not a failure of the work item. We will run it again and reboot this computer to help its devices"
Write-Error "If this occurs repeatedly, please check for architectural mismatch, e.g. sending x86 or x86_64 APKs to an arm64_v8a-only queue."
$retry = $true
$reboot = $true
Break
}

# PACKAGE_INSTALLATION_FAILURE
78 {
Write-Error "Encountered PACKAGE_INSTALLATION_FAILURE. This is typically not a failure of the work item. We will try it again on another Helix agent"
Write-Error "If this occurs repeatedly, please check for architectural mismatch, e.g. requesting installation on arm64_v8a-only queue for x86 or x86_64 APKs."

Remove-Apps

$retry = $true
Break
}
}

if (Test-Path -Path "$Env:HELIX_WORKITEM_ROOT\.retry" -PathType Leaf) {
$retry = $true;
$retry_message = Get-Content -Path "$Env:HELIX_WORKITEM_ROOT\.retry"
}

if (Test-Path -Path "$Env:HELIX_WORKITEM_ROOT\.reboot" -PathType Leaf) {
$reboot = $true;
$reboot_message = Get-Content -Path "$Env:HELIX_WORKITEM_ROOT\.reboot"
}

if ($retry) {
if ([string]::IsNullOrEmpty($retry_message)) {
$retry_message = 'Retrying because we could not enumerate all Android devices'
}

& "$Env:HELIX_PYTHONPATH" -c "from helix.workitemutil import request_infra_retry; request_infra_retry('$retry_message')"
}

if ($reboot) {
if ([string]::IsNullOrEmpty($reboot_message)) {
$reboot_message = 'Rebooting to allow Android emulator to restart'
Write-Output "Removing installed apps after unsuccessful run"
$adb_path = & dotnet exec $Env:XHARNESS_CLI_PATH android state --adb
$packages = & $adb_path shell pm list packages net.dot
$split_packages = $packages.split(':')
For ($i = 1; $i -lt $split_packages.Length; $i += 2) {
Write-Output " Uninstalling $($split_packages[$i])"
$output = & $adb_path uninstall $split_packages[$i]
Write-Output " $output"
}

& "$Env:HELIX_PYTHONPATH" -c "from helix.workitemutil import request_reboot; request_reboot('$reboot_message')"
exit -3
} else {
Write-Output "User command ended with $($process.ExitCode)"
exit $process.ExitCode
}

exit $exit_code
Loading