Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Profiler] Fix graph_executor_debug hang #12382

Merged
merged 3 commits into from
Aug 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
* \param limit_zero_time_iterations The maximum number of repeats when
* measured time is equal to 0. It helps to avoid hanging during measurements.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
Expand All @@ -582,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);

} // namespace profiling
} // namespace runtime
Expand Down
49 changes: 45 additions & 4 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,15 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
def _run_debug(
self,
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
):
"""Execute the node specified by index.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
Expand All @@ -233,6 +241,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
Expand Down Expand Up @@ -272,6 +281,7 @@ def run(
number=10,
repeat=1,
min_repeat_ms=1,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**input_dict,
Expand Down Expand Up @@ -299,6 +309,10 @@ def run(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -317,6 +331,7 @@ def run(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
Expand All @@ -328,7 +343,13 @@ def run(
self.debug_datum.display_debug_result()

def run_individual(
self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
self,
number,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Run each operation in the graph and get the time per op for all ops.

Expand All @@ -351,6 +372,10 @@ def run_individual(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -364,7 +389,12 @@ def run_individual(
the repeat of the measurement.
"""
res = self._run_individual(
number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
)
results = []
offset = 0
Expand All @@ -384,6 +414,7 @@ def run_individual_node(
number=10,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
Expand Down Expand Up @@ -415,6 +446,10 @@ def run_individual_node(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -428,7 +463,13 @@ def run_individual_node(
"""
# Results are returned as serialized strings which we deserialize
res = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
index,
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, res)
Expand Down
7 changes: 7 additions & 0 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ def benchmark(
repeat=5,
number=5,
min_repeat_ms=None,
limit_zero_time_iterations=100,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
Expand Down Expand Up @@ -402,6 +403,10 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

limit_zero_time_iterations : Optional[int]
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
Expand Down Expand Up @@ -437,6 +442,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
)(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args)
if kwargs:
self.set_input(**kwargs)
Expand All @@ -446,6 +452,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)()
6 changes: 6 additions & 0 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ def time_evaluator(
number=10,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
f_preproc="",
Expand Down Expand Up @@ -310,6 +311,10 @@ def time_evaluator(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.

limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand Down Expand Up @@ -340,6 +345,7 @@ def time_evaluator(
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
f_preproc,
Expand Down
7 changes: 7 additions & 0 deletions python/tvm/runtime/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ def benchmark(
repeat=5,
number=5,
min_repeat_ms=None,
limit_zero_time_iterations=100,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
Expand Down Expand Up @@ -630,6 +631,10 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.

limit_zero_time_iterations : Optional[int]
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.

end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
Expand Down Expand Up @@ -672,6 +677,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
)(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args)
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
Expand All @@ -681,6 +687,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)(func_name)
16 changes: 11 additions & 5 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
Expand Down Expand Up @@ -477,6 +478,7 @@ typedef struct {
int number;
int repeat;
int min_repeat_ms;
int limit_zero_time_iterations;
int cooldown_interval_ms;
int repeats_to_cooldown;
} time_evaluator_state_t;
Expand All @@ -487,14 +489,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
int* ret_type_code) {
ret_val[0].v_handle = NULL;
ret_type_code[0] = kTVMNullptr;
if (num_args < 10) {
if (num_args < 11) {
TVMAPIErrorf("not enough args");
return kTvmErrorFunctionCallNumArguments;
}
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) {
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
TVMAPIErrorf("one or more invalid arg types");
return kTvmErrorFunctionCallWrongArgType;
}
Expand All @@ -506,8 +508,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
g_time_evaluator_state.number = args[4].v_int64;
g_time_evaluator_state.repeat = args[5].v_int64;
g_time_evaluator_state.min_repeat_ms = args[6].v_int64;
g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64;
g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64;
g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64;
g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64;
g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64;

int ret_code =
TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time);
Expand Down Expand Up @@ -556,6 +559,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
double* iter = (double*)result_byte_arr->data;
for (int i = 0; i < g_time_evaluator_state.repeat; i++) {
double curr_res_seconds = 0.0;
int absolute_zero_times = 0;
// do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0).
do {
if (curr_res_seconds > 0.0) {
Expand Down Expand Up @@ -588,7 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
if (err != kTvmErrorNoError) {
goto release_and_return;
}
} while (curr_res_seconds < min_repeat_seconds);
if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++;
} while (curr_res_seconds < min_repeat_seconds &&
absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations);
double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number;
*iter = mean_exec_seconds;
iter++;
Expand Down
Loading