[microTVM] Rework evaluate_model_accuracy into a more generic helper function #12539

Merged 2 commits on Aug 23, 2022
2 changes: 1 addition & 1 deletion python/tvm/micro/testing/__init__.py
@@ -16,5 +16,5 @@
 # under the License.
 
 """Allows the tools specified below to be imported directly from tvm.micro.testing"""
-from .evaluation import tune_model, create_aot_session, evaluate_model_accuracy
+from .evaluation import tune_model, create_aot_session, predict_labels_aot
 from .utils import get_supported_boards, get_target
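Since this file just re-exports the helper, the rename is visible to downstream code through the tvm.micro.testing namespace. A quick hypothetical usage line (not part of the diff):

    # Hypothetical import; mirrors the re-export above.
    from tvm.micro.testing import predict_labels_aot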
21 changes: 6 additions & 15 deletions python/tvm/micro/testing/evaluation.py
@@ -133,27 +133,18 @@ def create_aot_session(
     return tvm.micro.Session(project.transport(), timeout_override=timeout_override)
 
 
-# This utility functions was designed ONLY for one input / one output models
-# where the outputs are confidences for different classes.
-def evaluate_model_accuracy(session, aot_executor, input_data, true_labels, runs_per_sample=1):
-    """Evaluates an AOT-compiled model's accuracy and runtime over an RPC session. Works well
-    when used with create_aot_session."""
+def predict_labels_aot(session, aot_executor, input_data, runs_per_sample=1):
+    """Predicts labels for each sample in input_data using host-driven AOT.
+    Returns an iterator of (label, runtime) tuples. This function can only
+    be used with models for which the output is the confidence for each class."""
 
     assert aot_executor.get_num_inputs() == 1
     assert aot_executor.get_num_outputs() == 1
     assert runs_per_sample > 0
 
-    predicted_labels = []
-    aot_runtimes = []
     for sample in input_data:
         aot_executor.get_input(0).copyfrom(sample)
         result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)()
+        predicted_label = aot_executor.get_output(0).numpy().argmax()
         runtime = result.mean
-        output = aot_executor.get_output(0).numpy()
-        predicted_labels.append(output.argmax())
-        aot_runtimes.append(runtime)
-
-    num_correct = sum(u == v for u, v in zip(true_labels, predicted_labels))
-    average_time = sum(aot_runtimes) / len(aot_runtimes)
-    accuracy = num_correct / len(predicted_labels)
-    return average_time, accuracy, predicted_labels
+        yield predicted_label, runtime
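Because the reworked helper yields per-sample (label, runtime) tuples instead of aggregating them, a caller that still wants the old accuracy and average-runtime numbers can rebuild them on top of it. A minimal sketch, assuming session, aot_executor, input_data, and true_labels are set up as before (hypothetical code, not part of the PR):

    import tvm.micro.testing

    # Hypothetical sketch: recover the old evaluate_model_accuracy outputs
    # from the generic per-sample iterator.
    results = list(
        tvm.micro.testing.predict_labels_aot(
            session, aot_executor, input_data, runs_per_sample=1
        )
    )
    predicted_labels = [label for label, _ in results]
    aot_runtimes = [runtime for _, runtime in results]

    accuracy = sum(u == v for u, v in zip(true_labels, predicted_labels)) / len(predicted_labels)
    average_time = sum(aot_runtimes) / len(aot_runtimes)  # on-device seconds per inference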
13 changes: 7 additions & 6 deletions tests/micro/common/test_autotune.py
@@ -76,17 +76,18 @@ def test_kws_autotune_workflow(platform, board, tmp_path):
         np.random.randint(low=-127, high=128, size=(1, 1960), dtype=np.int8) for x in range(3)
     )
 
-    labels = [0, 0, 0]
-
     # Validate performance across random runs
-    time, _, _ = tvm.micro.testing.evaluate_model_accuracy(
-        session, aot_executor, samples, labels, runs_per_sample=20
-    )
+    runtimes = [
+        runtime
+        for _, runtime in tvm.micro.testing.predict_labels_aot(
+            session, aot_executor, samples, runs_per_sample=20
+        )
+    ]
     # Each runtime is the average time taken to execute model inference on the
     # device, measured in seconds. It does not include the time to upload the
     # input data via RPC. On slow boards like the Arduino Due, each runtime is
     # around 0.12 (120 ms), so this gives us plenty of buffer.
-    assert time < 1
+    assert np.median(runtimes) < 1
 
 
 if __name__ == "__main__":
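One design note on the test change: asserting on the median of the per-sample runtimes, rather than a single mean, keeps the check robust when one run is anomalously slow. A hypothetical illustration with made-up values (the 0.12 s figure echoes the Arduino Due ballpark in the comment above):

    import numpy as np

    # Made-up per-sample runtimes in seconds; one slow outlier inflates the mean.
    runtimes = [0.12, 0.11, 2.50]
    print(np.mean(runtimes))    # ~0.91 s, dragged up by the outlier
    print(np.median(runtimes))  # 0.12 s, still reflects typical inference time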