diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 5f6f36e3b279..3922ef76dfb7 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * minimum duration requirement of one `repeat`. * i.e., When the run time of one `repeat` falls below this time, * the `number` parameter will be automatically increased. + * \param limit_zero_time_iterations The maximum number of repeats when + * measured time is equal to 0. It helps to avoid hanging during measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the @@ -582,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i * \return f_timer A timer function. */ PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown, - PackedFunc f_preproc = nullptr); + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown, PackedFunc f_preproc = nullptr); } // namespace profiling } // namespace runtime diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py index 5ce378965246..8160fa96b8ee 100644 --- a/python/tvm/contrib/debugger/debug_executor.py +++ b/python/tvm/contrib/debugger/debug_executor.py @@ -223,7 +223,15 @@ def _run_per_layer(self): output_tensors.append(self._get_node_output(i, j)) self.debug_datum.update_output_tensors(output_tensors) - def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown): + def _run_debug( + self, + number, + repeat, + min_repeat_ms, + limit_zero_time_iterations, + cooldown_interval_ms, + repeats_to_cooldown, + ): """Execute the node specified with index will be executed. Each debug output will be copied to the buffer Time consumed for each execution will be set as debug output. @@ -233,6 +241,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -272,6 +281,7 @@ def run( number=10, repeat=1, min_repeat_ms=1, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, **input_dict, @@ -299,6 +309,10 @@ def run( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + limit_zero_time_iterations: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -317,6 +331,7 @@ def run( number=number, repeat=repeat, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, ) @@ -328,7 +343,13 @@ def run( self.debug_datum.display_debug_result() def run_individual( - self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1 + self, + number, + repeat=1, + min_repeat_ms=0, + limit_zero_time_iterations=100, + cooldown_interval_ms=0, + repeats_to_cooldown=1, ): """Run each operation in the graph and get the time per op for all ops. @@ -351,6 +372,10 @@ def run_individual( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + limit_zero_time_iterations: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -364,7 +389,12 @@ def run_individual( the repeat of the measurement. """ res = self._run_individual( - number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + number, + repeat, + min_repeat_ms, + limit_zero_time_iterations, + cooldown_interval_ms, + repeats_to_cooldown, ) results = [] offset = 0 @@ -384,6 +414,7 @@ def run_individual_node( number=10, repeat=1, min_repeat_ms=0, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, ): @@ -415,6 +446,10 @@ def run_individual_node( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + limit_zero_time_iterations: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -428,7 +463,13 @@ def run_individual_node( """ # Results are returned as serialized strings which we deserialize res = self._run_individual_node( - index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown + index, + number, + repeat, + min_repeat_ms, + limit_zero_time_iterations, + cooldown_interval_ms, + repeats_to_cooldown, ) fmt = "@" + ("d" * repeat) results = struct.unpack(fmt, res) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index a4b90baf1d38..08dae307a89e 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -355,6 +355,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, + limit_zero_time_iterations=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -402,6 +403,10 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. + limit_zero_time_iterations : Optional[int] + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + end_to_end : bool If set, include time to transfer input tensors to the device and time to transfer returned tensors in the total runtime. This will give accurate timings for end to end @@ -437,6 +442,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, )(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args) if kwargs: self.set_input(**kwargs) @@ -446,6 +452,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )() diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index e2af556413b4..e85b99234100 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -277,6 +277,7 @@ def time_evaluator( number=10, repeat=1, min_repeat_ms=0, + limit_zero_time_iterations=100, cooldown_interval_ms=0, repeats_to_cooldown=1, f_preproc="", @@ -310,6 +311,10 @@ def time_evaluator( i.e., When the run time of one `repeat` falls below this time, the `number` parameter will be automatically increased. + limit_zero_time_iterations: int, optional + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + cooldown_interval_ms: int, optional The cooldown interval in milliseconds between the number of repeats defined by `repeats_to_cooldown`. @@ -340,6 +345,7 @@ def time_evaluator( number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, f_preproc, diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 83f1656a0dd8..c065d77a7c9f 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -583,6 +583,7 @@ def benchmark( repeat=5, number=5, min_repeat_ms=None, + limit_zero_time_iterations=100, end_to_end=False, cooldown_interval_ms=0, repeats_to_cooldown=1, @@ -630,6 +631,10 @@ def benchmark( milliseconds. This can be used to ensure that the function is run enough to get an accurate measurement. + limit_zero_time_iterations : Optional[int] + The maximum number of repeats when measured time is equal to 0. + It helps to avoid hanging during measurements. + end_to_end : bool If set, include time to transfer input tensors to the device and time to transfer returned tensors in the total runtime. This will give accurate timings for end to end @@ -672,6 +677,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, )(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args) if args or kwargs: self.set_input(func_name, *args, **kwargs) @@ -681,6 +687,7 @@ def benchmark( repeat=repeat, number=number, min_repeat_ms=min_repeat_ms, + limit_zero_time_iterations=limit_zero_time_iterations, cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )(func_name) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 23ab5af08a7f..2151c23f8462 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -477,6 +478,7 @@ typedef struct { int number; int repeat; int min_repeat_ms; + int limit_zero_time_iterations; int cooldown_interval_ms; int repeats_to_cooldown; } time_evaluator_state_t; @@ -487,14 +489,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re int* ret_type_code) { ret_val[0].v_handle = NULL; ret_type_code[0] = kTVMNullptr; - if (num_args < 10) { + if (num_args < 11) { TVMAPIErrorf("not enough args"); return kTvmErrorFunctionCallNumArguments; } if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr || type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt || type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt || - type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) { + type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) { TVMAPIErrorf("one or more invalid arg types"); return kTvmErrorFunctionCallWrongArgType; } @@ -506,8 +508,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re g_time_evaluator_state.number = args[4].v_int64; g_time_evaluator_state.repeat = args[5].v_int64; g_time_evaluator_state.min_repeat_ms = args[6].v_int64; - g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64; - g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64; + g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64; + g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64; + g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64; int ret_code = TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time); @@ -556,6 +559,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* double* iter = (double*)result_byte_arr->data; for (int i = 0; i < g_time_evaluator_state.repeat; i++) { double curr_res_seconds = 0.0; + int absolute_zero_times = 0; // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0). do { if (curr_res_seconds > 0.0) { @@ -588,7 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* if (err != kTvmErrorNoError) { goto release_and_return; } - } while (curr_res_seconds < min_repeat_seconds); + if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++; + } while (curr_res_seconds < min_repeat_seconds && + absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations); double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number; *iter = mean_exec_seconds; iter++; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index cf98141037b7..ba546165c6a0 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -56,6 +56,9 @@ class GraphExecutorDebug : public GraphExecutor { * By default, one `repeat` contains `number` runs. If this parameter is set, * the parameters `number` will be dynamically adjusted to meet the * minimum duration requirement of one `repeat`. + * \param limit_zero_time_iterations The maximum number of repeats when + * measured time is equal to 0. It helps to avoid hanging during + * measurements. * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats * defined by `repeats_to_cooldown`. * \param repeats_to_cooldown The number of repeats before the @@ -64,7 +67,8 @@ class GraphExecutorDebug : public GraphExecutor { * representing the number of layers. Next the encoded real numbers are float32_t in the number of * repeat multiplied by the number of layers. */ - std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms, + std::string RunIndividual(int number, int repeat, int min_repeat_ms, + int limit_zero_time_iterations, int cooldown_interval_ms, int repeats_to_cooldown) { // warmup run GraphExecutor::Run(); @@ -73,14 +77,16 @@ class GraphExecutorDebug : public GraphExecutor { if (tkey == "rpc") { // RPC modules rely on remote timing which implements the logic from the else branch. for (size_t index = 0; index < op_execs_.size(); ++index) { - time_sec_per_op[index] = RunOpRPC(index, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + time_sec_per_op[index] = + RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); } } else { int op = 0; for (size_t index = 0; index < op_execs_.size(); ++index) { - std::string result_str = RunIndividualNode(index, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + std::string result_str = + RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); const double* blob_ptr = reinterpret_cast(result_str.data()); for (int i = 0; i < repeat; ++i, ++blob_ptr) { time_sec_per_op[index].push_back(*blob_ptr); @@ -110,14 +116,15 @@ class GraphExecutorDebug : public GraphExecutor { } std::string RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown) { std::string tkey = module_->type_key(); if (tkey == "rpc") { LOG(FATAL) << "RPC measurements should not use RunIndividualNode!"; } - if (!op_execs_[node_index] || nodes_[node_index].param.func_name == "__nop") { + if (!op_execs_[node_index]) { // don't return anything... std::ostringstream os; double zero = 0; @@ -131,12 +138,14 @@ class GraphExecutorDebug : public GraphExecutor { Device& d = devices_[0]; PackedFunc time_evaluator = profiling::WrapTimeEvaluator( TypedPackedFunc([this, node_index]() { this->RunOpHost(node_index); }), d, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown); + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); return time_evaluator(); } std::vector RunOpRPC(int index, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown) { std::vector results(repeat, 0); // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes // which represent inputs/parameters to the graph. Other types may be supported in the @@ -152,11 +161,6 @@ class GraphExecutorDebug : public GraphExecutor { return results; } - if (nodes_[index].param.func_name == "__nop") { - LOG_INFO << "Skipping __nop function"; - return results; - } - const Device& dev = data_entry_[entry_id(index, 0)]->device; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; @@ -167,7 +171,8 @@ class GraphExecutorDebug : public GraphExecutor { runtime::Registry::Get("runtime.RPCTimeEvaluator") -> operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, ""); + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, ""); int num_flat_args = num_inputs + num_outputs; std::unique_ptr values(new TVMValue[num_flat_args]); @@ -390,15 +395,18 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[0]; int repeat = args[1]; int min_repeat_ms = args[2]; - int cooldown_interval_ms = args[3]; - int repeats_to_cooldown = args[4]; + int limit_zero_time_iterations = args[3]; + int cooldown_interval_ms = args[4]; + int repeats_to_cooldown = args[5]; ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); + ICHECK_GE(limit_zero_time_iterations, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); - std::string blob = this->RunIndividual(number, repeat, min_repeat_ms, cooldown_interval_ms, - repeats_to_cooldown); + std::string blob = + this->RunIndividual(number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); @@ -410,17 +418,20 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, int number = args[1]; int repeat = args[2]; int min_repeat_ms = args[3]; - int cooldown_interval_ms = args[4]; - int repeats_to_cooldown = args[5]; + int limit_zero_time_iterations = args[4]; + int cooldown_interval_ms = args[5]; + int repeats_to_cooldown = args[6]; ICHECK_GE(node_index, 0); ICHECK_LT(node_index, nodes_.size()); ICHECK_GT(number, 0); ICHECK_GT(repeat, 0); ICHECK_GE(min_repeat_ms, 0); + ICHECK_GE(limit_zero_time_iterations, 0); ICHECK_GE(cooldown_interval_ms, 0); ICHECK_GT(repeats_to_cooldown, 0); std::string blob = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); TVMByteArray arr; arr.size = blob.length(); arr.data = blob.data(); diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 187a98964af2..2c92633c34fc 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -848,8 +848,8 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction") }); PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown, - PackedFunc f_preproc) { + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown, PackedFunc f_preproc) { ICHECK(pf != nullptr); if (static_cast(dev.device_type) == static_cast(kDLMicroDev)) { @@ -858,7 +858,8 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, return (*get_micro_time_evaluator)(pf, dev, number, repeat); } - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, f_preproc](TVMArgs args, TVMRetValue* rv) mutable { TVMRetValue temp; std::ostringstream os; @@ -872,7 +873,7 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, f_preproc.CallPacked(args, &temp); } double duration_ms = 0.0; - + int absolute_zero_times = 0; do { if (duration_ms > 0.0) { const double golden_ratio = 1.618; @@ -887,8 +888,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, } t->Stop(); int64_t t_nanos = t->SyncAndGetElapsedNanos(); + if (t_nanos == 0) absolute_zero_times++; duration_ms = t_nanos / 1e6; - } while (duration_ms < min_repeat_ms); + } while (duration_ms < min_repeat_ms && absolute_zero_times < limit_zero_time_iterations); double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index ff5889500592..a3f41e063226 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -191,7 +191,8 @@ class RPCModuleNode final : public ModuleNode { } PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat, - int min_repeat_ms, int cooldown_interval_ms, int repeats_to_cooldown, + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown, const std::string& f_preproc_name) { InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator"); // Remove session mask because we pass dev by parts. @@ -200,13 +201,15 @@ class RPCModuleNode final : public ModuleNode { dev = RemoveRPCSessionMask(dev); if (module_handle_ != nullptr) { - return remote_get_time_evaluator_( - GetRef(this), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_(GetRef(this), name, + static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } else { - return remote_get_time_evaluator_( - Optional(nullptr), name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); + return remote_get_time_evaluator_(Optional(nullptr), name, + static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } } @@ -244,7 +247,7 @@ class RPCModuleNode final : public ModuleNode { // The local channel std::shared_ptr sess_; // remote function to get time evaluator - TypedPackedFunc, std::string, int, int, int, int, int, int, int, + TypedPackedFunc, std::string, int, int, int, int, int, int, int, int, std::string)> remote_get_time_evaluator_; // remote function getter for modules. @@ -363,8 +366,9 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) { TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") .set_body_typed([](Optional opt_mod, std::string name, int device_type, int device_id, - int number, int repeat, int min_repeat_ms, int cooldown_interval_ms, - int repeats_to_cooldown, std::string f_preproc_name) { + int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown, + std::string f_preproc_name) { Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; @@ -373,7 +377,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") std::string tkey = m->type_key(); if (tkey == "rpc") { return static_cast(m.operator->()) - ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, + ->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, f_preproc_name); } else { PackedFunc f_preproc; @@ -386,7 +391,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") PackedFunc pf = m.GetFunction(name, true); CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry"; return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown, f_preproc); + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, f_preproc); } } else { auto* pf = runtime::Registry::Get(name); @@ -399,7 +405,8 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator") f_preproc = *pf_preproc; } return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown, f_preproc); + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, f_preproc); } }); diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 56f586d67046..aa9546f3b71a 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -169,7 +169,7 @@ class AsyncLocalSession : public LocalSession { try { TVMArgs args(arg_values, arg_type_codes, num_args); PackedFunc retfunc = this->GetTimeEvaluator(args[0], args[1], args[2], args[3], args[4], - args[5], args[6], args[7], args[8]); + args[5], args[6], args[7], args[8], args[9]); TVMRetValue rv; rv = retfunc; this->EncodeReturn(std::move(rv), [&](TVMArgs encoded_args) { @@ -252,7 +252,8 @@ class AsyncLocalSession : public LocalSession { // time evaluator PackedFunc GetTimeEvaluator(Optional opt_mod, std::string name, int device_type, int device_id, int number, int repeat, int min_repeat_ms, - int cooldown_interval_ms, int repeats_to_cooldown) { + int limit_zero_time_iterations, int cooldown_interval_ms, + int repeats_to_cooldown) { Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; @@ -261,21 +262,23 @@ class AsyncLocalSession : public LocalSession { Module m = opt_mod.value(); std::string tkey = m->type_key(); return WrapWasmTimeEvaluator(m.GetFunction(name, false), dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown); + limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown); } else { auto* pf = runtime::Registry::Get(name); CHECK(pf != nullptr) << "Cannot find " << name << " in the global function"; - return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, + return WrapWasmTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms, + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown); } } // time evaluator PackedFunc WrapWasmTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, - int min_repeat_ms, int cooldown_interval_ms, - int repeats_to_cooldown) { - auto ftimer = [pf, dev, number, repeat, min_repeat_ms, cooldown_interval_ms, - repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) { + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown) { + auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown](TVMArgs args, TVMRetValue* rv) { // the function is a async function. PackedFunc on_complete = args[args.size() - 1]; // keep argument alive in finvoke so that they @@ -293,7 +296,8 @@ class AsyncLocalSession : public LocalSession { auto* time_exec = runtime::Registry::Get("__async.wasm.TimeExecution"); CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function"; (*time_exec)(TypedPackedFunc(finvoke), dev, number, repeat, min_repeat_ms, - cooldown_interval_ms, repeats_to_cooldown, on_complete); + limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown, + on_complete); }; return PackedFunc(ftimer); } diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 8df26aff14c3..8df382dbc837 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1058,6 +1058,7 @@ export class Instance implements Disposable { nstep: number, repeat: number, minRepeatMs: number, + limitZeroTimeIterations: number, cooldownIntervalMs: number, repeatsToCooldown: number ): Promise => { @@ -1068,6 +1069,7 @@ export class Instance implements Disposable { for (let i = 0; i < repeat; ++i) { let durationMs = 0.0; + let absoluteZeroTimes = 0; do { if (durationMs > 0.0) { let golden_ratio = 1.618; @@ -1081,7 +1083,10 @@ export class Instance implements Disposable { const tend: number = perf.now(); durationMs = tend - tstart; - } while (durationMs < minRepeatMs); + if (durationMs == 0) { + absoluteZeroTimes++; + } + } while (durationMs < minRepeatMs && absoluteZeroTimes < limitZeroTimeIterations); const speed = durationMs / setupNumber / 1000; result.push(speed); if (cooldownIntervalMs > 0.0 && (i % repeatsToCooldown) == 0 ) {