#!/usr/bin/env python3
import argparse
import subprocess
import signal
import sys
import os
import multiprocessing
from hunterpy.definitions import bots, NOERROR, UNKNOWN
from hunterpy.jsonresults import ResultsInterpreter, ResultsParser
from hunterpy.utils import OptionalColorText, sigterm_this_process
from collections import Counter
def get_percent(part, total):
    return 100.0 * float(part) / float(total)
def get_percent_str(part, total):
percent_part = get_percent(part, total)
return "%d%%" % int(percent_part) if percent_part.is_integer() else "%.1f%%" % percent_part
class LocalGitWebKitRepoInfoChecker():
def __init__(self, webkitdir):
self.webkitdir = webkitdir
if not os.path.isdir(webkitdir):
raise ValueError('%s is not a directory' % webkitdir)
if not os.path.isdir(os.path.join(webkitdir, 'LayoutTests')):
raise ValueError('No LayoutTest subdir inside %s' % webkitdir)
def test_or_expectations_modified_in_range(self, test_path):
curdir = os.getcwd()
os.chdir(self.webkitdir)
try:
            # If the test has subtests, use the main file as the path here; the same path is used when searching the string in TestExpectations below.
test_path = test_path.split('?')[0]
            # First populate the array with the usual names, because the test or its expectation may have been removed, so it is better not to rely on the file still being there.
paths_to_check = [ os.path.join('LayoutTests', test_path) ]
suffix = test_path[test_path.rfind('.'):]
test_path_without_suffix = test_path[:test_path.rfind(suffix) ]
paths_to_check.append(os.path.join('LayoutTests', test_path_without_suffix + '-expected.txt'))
paths_to_check.append(os.path.join('LayoutTests', test_path_without_suffix + '-expected' + suffix))
            for platform in ['gtk', 'gtk-wayland', 'gtk4', 'wpe', 'glib', 'wk2', 'mac', 'mac-wk2', 'ios', 'ios-wk2']:
paths_to_check.append(os.path.join('LayoutTests', 'platform', platform, test_path_without_suffix + '-expected.txt'))
paths_to_check.append(os.path.join('LayoutTests', 'platform', platform, test_path_without_suffix + '-expected' + suffix))
            # Now search the filesystem for matching expectation files
for dirpath, dirs, files in os.walk('LayoutTests'):
for file in files:
path_file = os.path.join(dirpath, file)
if (test_path_without_suffix + '-expected') in path_file:
if path_file not in paths_to_check:
paths_to_check.append(path_file)
            # 1. Check whether any of these files was modified in the interval
gitcmd = ['git', 'log', '-1', '%s..%s' % (self.start_git_rev, self.end_git_rev), '--']
gitcmd.extend(paths_to_check)
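            # The resulting command has this shape (paths are hypothetical):
            #   git log -1 <start_hash>..<end_hash> -- LayoutTests/fast/foo.html LayoutTests/fast/foo-expected.txt ...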
            pgit = subprocess.Popen(gitcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.webkitdir)
            (outgit, errgit) = pgit.communicate()
            gitreturncode = pgit.returncode
if gitreturncode == 0 and len(errgit) == 0:
if len(outgit) > 0:
                    print('# Not marking test "%s" as flaky because it was modified in the checked interval' % test_path)
return True
            # 2. Check whether the test name was modified inside the TestExpectations files in the interval
expectation_files = []
for dirpath, dirs, files in os.walk('LayoutTests'):
for file in files:
if file == 'TestExpectations':
expectation_files.append(os.path.join(dirpath, file))
assert(len(expectation_files) > 4)
gitcmd = ['git', 'log', '-S', test_path, '-1', '%s..%s' % (self.start_git_rev, self.end_git_rev), '--']
gitcmd.extend(expectation_files)
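            # git log -S lists the commits in the range that changed the number of occurrences of the
            # test path inside any TestExpectations file, e.g. (hypothetical path):
            #   git log -S fast/foo.html -1 <start_hash>..<end_hash> -- LayoutTests/TestExpectations ...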
            pgit = subprocess.Popen(gitcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.webkitdir)
            (outgit, errgit) = pgit.communicate()
            gitreturncode = pgit.returncode
if gitreturncode == 0 and len(errgit) == 0:
if len(outgit) > 0:
print ("# [GIT] Detected test path %s modified inside TestExpectation files" % test_path)
return True
return False
finally:
os.chdir(curdir)
def idrev_to_githash(self, id_rev):
gitcmd = ['git', 'log', '-1', '--grep', 'Canonical link: https://commits.webkit.org/%s@main' % id_rev, '--pretty=format:%H']
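        # WebKit commit messages carry a line like "Canonical link: https://commits.webkit.org/265432@main",
        # so grepping for it maps an identifier such as 265432 back to its git hash (identifier value is illustrative).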
pgit = subprocess.Popen(gitcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.webkitdir)
(outgit, errgit) = pgit.communicate()
gitreturncode = pgit.returncode
if gitreturncode == 0 and len(errgit) == 0:
githash = outgit.decode('utf8').strip()
if len(githash) > 3:
return githash
        raise RuntimeError('Something unexpected happened with the git command when trying to find the git hash for identifier "%s@main".\nThe git command returned status %s, its output was "%s" and its stderr was "%s".\nTry to checkout the main branch and pull.' % (id_rev, gitreturncode, outgit, errgit))
def set_idrange_search(self, start_rev, end_rev):
self.start_git_rev = self.idrev_to_githash(start_rev)
self.end_git_rev = self.idrev_to_githash(end_rev)
def calculate_flakiness_factor(results):
flakiness_factor = 0
prev_result = results[0]
for result in results:
        # If a run produced two results, the test was already flaky within that run
if ' ' in result:
flakiness_factor += 1
if result != prev_result:
flakiness_factor += 1
prev_result = result
return get_percent(flakiness_factor, len(results))
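# Worked example (inputs assumed): calculate_flakiness_factor(['PASS', 'FAIL', 'FAIL', 'PASS TEXT'])
# counts one PASS->FAIL change, one FAIL->'PASS TEXT' change and one in-run double result
# ('PASS TEXT'), so it returns 100 * 3 / 4 = 75.0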
def translate_individual_expectation(expectation):
    assert(isinstance(expectation, str))
    translations = {'FAIL': 'Failure', 'TEXT': 'Failure', 'TIMEOUT': 'Timeout', 'PASS': 'Pass',
                    'IMAGE': 'ImageOnlyFailure', 'MISSING': 'Missing', 'CRASH': 'Crash', 'SKIP': 'Skip'}
    if expectation in translations:
        return translations[expectation]
    raise NotImplementedError('Implement translation for expectation %s' % expectation)
def get_str_translated_unique_expectations(expectations):
if isinstance(expectations, str):
expectations = [ expectations ]
assert(isinstance(expectations, list))
translated_expectations_set = set()
for test_result in expectations:
# subresults can be separated by spaces or '+'
for subresult in test_result.replace('+', ' ').split(' '):
translated_expectations_set.add(translate_individual_expectation(subresult))
return ' '.join(sorted(translated_expectations_set))
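# Illustrative example: get_str_translated_unique_expectations(['TEXT PASS', 'TIMEOUT'])
# translates the subresults to {'Failure', 'Pass', 'Timeout'} and returns 'Failure Pass Timeout'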
def flaky_hunter(bot_key_name, number_of_runs, processes, debug, usecolor, printprogress, args_min_percent, args_max_percent, args_flakiness, webkitdir):
maybe_color = OptionalColorText(usecolor)
print("Analyzing test results for the last %s runs of the bot %s..." %(number_of_runs, bot_key_name))
if 'skip-failing' in bot_key_name:
        print(maybe_color.red('Results for "skip-failing" are incomplete because this bot skips tests that are already marked, so this will only report entries to add (not entries to modify).'))
results_parser = ResultsParser(bot_key_name, processes, None, printprogress, debug)
revisions_with_runs, tests = results_parser.get_list_of_failed_tests_at_last_n_runs(number_of_runs)
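    # revisions_with_runs is the ordered list of revision identifiers that had runs; tests maps
    # test path -> revision -> {'expected': ..., 'result': ...} (shape inferred from the accesses below)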
start_rev = int(revisions_with_runs[0])
end_rev = int(revisions_with_runs[-1])
assert (len(revisions_with_runs) == number_of_runs)
print("Got %s test results for %s runs in the range [%d@main-%d@main]" %(len(tests), number_of_runs, start_rev, end_rev))
initial_number_tests = len(tests)
tests_to_check = {}
expected_result_for_tests = {}
actual_results_for_tests = {}
flakiness_factor_for_tests = {}
revisions_for_results_for_tests = {}
    # Filter early, keeping only the tests that will be passed to git, since the git checks are really slow
for test in tests:
number_of_unexpected_results = 0
expectations_changed = False
for revision in tests[test]:
if test in expected_result_for_tests:
if expected_result_for_tests[test] != tests[test][revision]['expected']:
                    expectations_changed = True  # ignore tests where the expected value changed across the runs examined (this indicates someone tuned the expectations)
else:
expected_result_for_tests[test] = tests[test][revision]['expected']
if ResultsInterpreter.is_result_for_test_unexpected(tests[test][revision]):
number_of_unexpected_results += 1
if expectations_changed:
print("# Skipping test %s because the value of the expected results was modified in the range [%d@main-%d@main]" %(test, start_rev, end_rev))
continue
if number_of_unexpected_results == 0:
print("# Skipping test %s because all the results matched the expectation in range [%d@main-%d@main]" %(test, start_rev, end_rev))
continue
percent_failed_expectation = get_percent(number_of_unexpected_results, number_of_runs)
assert (percent_failed_expectation >= 0)
assert (percent_failed_expectation <= 100)
        if percent_failed_expectation < args_min_percent:
            print("# Skipping test %s because it failed in %s%% of the runs in the range [%d@main-%d@main] and --min is %s%%" % (test, percent_failed_expectation, start_rev, end_rev, args_min_percent))
            continue
        if percent_failed_expectation > args_max_percent:
            print("# Skipping test %s because it failed in %s%% of the runs in the range [%d@main-%d@main] and --max is %s%%" % (test, percent_failed_expectation, start_rev, end_rev, args_max_percent))
            continue
assert(test not in actual_results_for_tests)
actual_results_for_tests[test] = []
revisions_for_results_for_tests[test] = {}
for revision in revisions_with_runs:
if revision in tests[test]:
test_result = tests[test][revision]['result']
else:
                # Assume that the absence of reported entries means the test either passed or matched its expectations
test_result = expected_result_for_tests[test]
actual_results_for_tests[test].append(test_result)
if test_result not in revisions_for_results_for_tests[test]:
revisions_for_results_for_tests[test][test_result] = []
revisions_for_results_for_tests[test][test_result].append("%s@main" % revision)
assert(len(actual_results_for_tests[test]) == number_of_runs)
assert(test not in flakiness_factor_for_tests)
test_always_passes = False
different_result_for_test = set()
for result in actual_results_for_tests[test]:
for subresult in result.split(' '):
different_result_for_test.add(subresult)
if len(different_result_for_test) == 1:
if list(different_result_for_test)[0] == "PASS":
test_always_passes = True
else:
print("# Skipping test %s because it always gave the same result \"%s\" in the range [%d@main-%d@main]. This is a constant failure and not a flaky" %(test, list(different_result_for_test)[0], start_rev, end_rev))
continue
flakiness_factor_for_tests[test] = calculate_flakiness_factor (actual_results_for_tests[test])
        # Skip the test if its flakiness is lower than the selected value, but keep always-passing tests so we can later tell which flaky expectations should be removed
if args_flakiness > flakiness_factor_for_tests[test]:
if not test_always_passes:
print("# Skipping test %s because it has a flakiness factor of %s%% in the range [%d@main-%d@main] and --flakiness is %s%%" %(test, flakiness_factor_for_tests[test], start_rev, end_rev, args_flakiness))
continue
            # If there is only one result but it is PASS, keep the test so we can later print "Passing always in the range ... Please check further in the range and maybe remove expectations"
assert(len(different_result_for_test) > 1 or (len(different_result_for_test) == 1 and list(different_result_for_test)[0] == "PASS"))
tests_to_check[test] = tests[test]
    # Most tests have likely been filtered out by now, so calling git on the remaining ones for further checking is much cheaper.
tests = tests_to_check
if webkitdir:
print(maybe_color.bold("# Getting info from git for %s tests in the revision range [%d@main-%d@main]. This can be slow, please wait." %(len(tests), start_rev, end_rev)))
local_git_webkit = LocalGitWebKitRepoInfoChecker(webkitdir)
local_git_webkit.set_idrange_search(start_rev, end_rev)
pool = multiprocessing.Pool(processes=processes)
modified_tests_index_in_range = pool.map(local_git_webkit.test_or_expectations_modified_in_range, tests)
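        # pool.map iterates over the dict keys (test paths) and returns a list of booleans in the same order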
        # Filter the tests further with the git data
        tests_to_check = {}
        for id_test, test in enumerate(tests):
            if modified_tests_index_in_range[id_test]:
                print("# Skipping test %s because GIT reported modifications related to it in the range [%d@main-%d@main]" % (test, start_rev, end_rev))
                continue
            tests_to_check[test] = tests[test]
tests = tests_to_check
else:
modified_tests_index_in_range = [ False for test in tests ]
print(maybe_color.red("No gathering info about modified tests in the revision range [%d@main-%d@main]. Results will be less fiable. Please pass --webkitdir /path/to/webkit/gitrepo to enable." %(start_rev, end_rev)))
print(maybe_color.bold("Reporting %s tests with unexpected results from an initial set of %s tests for %s runs in the interval [%d@main-%d@main]" %(len(tests), initial_number_tests, number_of_runs, start_rev, end_rev)))
report_to_add_new_expectations_text = ""
report_to_modify_expectations_text = ""
number_of_expectations_to_add = 0
number_of_expectations_to_modify = 0
    # tests now holds the final filtered list. Report the values.
for test in tests:
# Report new expectations
if expected_result_for_tests[test] == 'PASS':
number_of_expectations_to_add += 1
desired_expectations = set()
for result in actual_results_for_tests[test]:
for subresult in result.split(' '):
desired_expectations.add(subresult)
report_to_add_new_expectations_text += ("%s [ %s ]\n" %(maybe_color.bold(test), maybe_color.bold(get_str_translated_unique_expectations(sorted(desired_expectations)))))
if flakiness_factor_for_tests[test] > 9:
report_to_add_new_expectations_text += maybe_color.red("# Flakiness factor: %s%%\n" % flakiness_factor_for_tests[test])
else:
report_to_add_new_expectations_text += maybe_color.bold("# Flakiness factor: %s%%\n" % flakiness_factor_for_tests[test])
counted_test_results = Counter(actual_results_for_tests[test])
for test_result in counted_test_results:
str_revisions_failed = ''
if test_result in revisions_for_results_for_tests[test]:
if len(revisions_for_results_for_tests[test][test_result]) > 0:
str_revisions_failed = '[ %s ]' % ' '.join(revisions_for_results_for_tests[test][test_result])
report_to_add_new_expectations_text += ("# {:<22} -> {} {}\n".format("%d times [%s]" % (counted_test_results[test_result],get_percent_str(counted_test_results[test_result], number_of_runs)), test_result, str_revisions_failed))
report_to_add_new_expectations_text += ("\n")
# Report expectations to be modified
else:
number_of_expectations_to_modify += 1
desired_expectations = set()
for result in actual_results_for_tests[test]:
for subresult in result.split(' '):
desired_expectations.add(subresult.strip())
if list(desired_expectations) == ['PASS']:
report_to_modify_expectations_text +="\n%s\n# Passing always in the range [%d@main-%d@main]. Please check further in the range and maybe remove expectations\n\n" %(maybe_color.bold(test), start_rev, end_rev)
continue
report_to_modify_expectations_text += maybe_color.red("-%s [ %s ]\n" %(test, get_str_translated_unique_expectations(expected_result_for_tests[test])))
report_to_modify_expectations_text += maybe_color.green("+%s [ %s ]\n" %(test, get_str_translated_unique_expectations(sorted(desired_expectations))))
if flakiness_factor_for_tests[test] > 9:
report_to_modify_expectations_text += maybe_color.red("# Flakiness factor: %s%%\n" % flakiness_factor_for_tests[test])
else:
report_to_modify_expectations_text += maybe_color.bold("# Flakiness factor: %s%%\n" % flakiness_factor_for_tests[test])
counted_test_results = Counter(actual_results_for_tests[test])
for test_result in counted_test_results:
str_revisions_failed = ''
if test_result in revisions_for_results_for_tests[test]:
if len(revisions_for_results_for_tests[test][test_result]) > 0:
str_revisions_failed = '[ %s ]' % ' '.join(revisions_for_results_for_tests[test][test_result])
report_to_modify_expectations_text += ("# {:<22} -> {} {}\n".format("%d times [%s]" % (counted_test_results[test_result],get_percent_str(counted_test_results[test_result], number_of_runs)), test_result, str_revisions_failed))
report_to_modify_expectations_text += ("\n")
if len(report_to_add_new_expectations_text) > 2:
print("\n\n")
print(maybe_color.bold("###############################################################"))
print(maybe_color.bold("################### %s EXPECTATIONS TO ADD ####################" % number_of_expectations_to_add))
print(maybe_color.bold("###############################################################\n"))
print(report_to_add_new_expectations_text)
if len(report_to_modify_expectations_text) > 2:
print("\n\n")
print(maybe_color.bold("###############################################################"))
print(maybe_color.bold("################## %s EXPECTATIONS TO MODIFY ##################" % number_of_expectations_to_modify))
print(maybe_color.bold("###############################################################\n"))
print(report_to_modify_expectations_text)
if not webkitdir:
        print(maybe_color.bold('# NOTE: To make these results more reliable, pass "--webkitdir /path/to/your/webkit/checkout" to this script'))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--debug", help="Print debug messages.", action="store_true")
parser.add_argument("--nocolor", help="Don't print colors", action="store_true")
parser.add_argument("--min", help="Only show tests with unexpected results higher than in MIN%% of the runs. Default is 10%%", type=float, default=10)
parser.add_argument("--max", help="Don't show tests with unexpected results lower than in MAX%% of the runs. Default is 100%%", type=float, default=100)
parser.add_argument("--flakiness", help="Don't show tests with a flakiness factor lower than FLAKINESS%%. Flakiness factor is defined as the percent of times the result changed from one run to the next one. Default is 5%%", type=float, default=5)
parser.add_argument("--webkitdir", help="Path to the local git repository of WebKit. Will enable checking local history to discard tests modified in the interval")
parser.add_argument("--noprintprogress", help="Don't print percentage progress of computations..", action="store_true")
parser.add_argument("--bot", help="Check the flakies for a specific bot. Default is: %(default)s", choices=list(bots.keys()), default="gtk-release")
parser.add_argument("-j", type=int, help="Number of processes to use. Default is the number of cores of this machine.", default=mp.cpu_count())
    parser.add_argument('--depth', help='Search only the last DEPTH revisions known for the bot. Default is 200.', default=200)
args = parser.parse_args()
    args_printprogress = False if args.noprintprogress else sys.stderr.isatty()
    # Exit immediately on CTRL+C or broken-pipe situations.
signal.signal(signal.SIGINT, sigterm_this_process)
signal.signal(signal.SIGPIPE, sigterm_this_process)
if args.min < 0 or args.min > 100:
raise ValueError('--min should be a value between 0 and 100')
if args.max < 0 or args.max > 100:
raise ValueError('--max should be a value between 0 and 100')
if args.min >= args.max:
        raise ValueError('--min should be lower than --max')
if args.flakiness < 0 or args.flakiness > 100:
raise ValueError('--flakiness should be a value between 0 and 100')
flaky_hunter(args.bot, int(args.depth), int(args.j), args.debug, not args.nocolor, args_printprogress, float(args.min), float(args.max), float(args.flakiness), args.webkitdir)
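# Example invocations (bot name and paths are illustrative; see --help for the full option list):
#   ./flakyhunter --bot gtk-release --depth 100
#   ./flakyhunter --bot gtk-release --webkitdir ~/checkouts/WebKit --min 20 --flakiness 10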