feat: enhance profiling and benchmarking (#3012)
Modify `PrintMemoryPlan` in `greedy_memory_planner.cc` for better
handling of tensor indices and scratch buffers.

Fix `total_ticks_per_tag_` usage in `micro_profiler.cc` and add
`ClearEvents` method.

Update `Makefile.inc` and `generic_model_benchmark.cc` to support
alternate memory regions and CRC32 checks for data integrity.

Include compression data in `metrics.cc` allocation records and
handle architecture-specific directives in
`show_meta_data.cc.template`.

BUG=see description
rkuester authored Dec 14, 2024
1 parent a535080 commit b2f2718
Showing 8 changed files with 228 additions and 41 deletions.
tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc (22 changes: 16 additions & 6 deletions)
@@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {
} else if (i < 62) {
return 'A' + (i - 36);
}
return '*';
return GetOrdinalCharacter(i % 62);
}

} // namespace
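
Note on the hunk above: indices of 62 and beyond now wrap around instead of all collapsing to '*', leaving '*' free for the scratch-buffer marking introduced in the hunks below. A minimal sketch of the complete function for context; the branches not visible in this hunk are paraphrased from the surrounding file and may differ slightly:

// Sketch only, not part of the diff. Maps a buffer index to a display
// character: '0'-'9', then 'a'-'z', then 'A'-'Z', wrapping every 62 indices.
char GetOrdinalCharacter(int i) {
  if (i < 10) {
    return '0' + i;
  } else if (i < 36) {
    return 'a' + (i - 10);
  } else if (i < 62) {
    return 'A' + (i - 36);
  }
  return GetOrdinalCharacter(i % 62);  // e.g. 62 -> '0', 63 -> '1'
}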
@@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
CalculateOffsetsIfNeeded();

for (int i = 0; i < buffer_count_; ++i) {
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",
GetOrdinalCharacter(i), i, requirements_[i].size,
buffer_offsets_[i], requirements_[i].first_time_used,
char c = '*';
if (requirements_[i].first_time_used != requirements_[i].last_time_used) {
// not a scratch buffer nor subgraph output tensor
c = GetOrdinalCharacter(i);
}
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,
i, requirements_[i].size, buffer_offsets_[i],
requirements_[i].first_time_used,
requirements_[i].last_time_used);
}

@@ -379,15 +384,20 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
const int line_end = ((offset + size) * kLineWidth) / max_size;
for (int n = line_start; n < line_end; ++n) {
if (line[n] == '.') {
line[n] = GetOrdinalCharacter(i);
if (requirements->first_time_used == requirements->last_time_used) {
// scratch buffer or subgraph output tensor
line[n] = '*';
} else {
line[n] = GetOrdinalCharacter(i);
}
} else {
line[n] = '!';
}
}
}
line[kLineWidth] = 0;

MicroPrintf("%s%d: %s (%dk)", t < 10 ? " " : "", t, (const char*)line,
MicroPrintf("%4d: %s (%dk)", t, (const char*)line,
(memory_use + 1023) / 1024);
}
}
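
Both hunks above apply the same rule: a buffer whose first and last use fall in the same allocation step (a scratch buffer or a subgraph output tensor) is drawn as '*', while every other buffer keeps its ordinal character. A compact restatement as a sketch; DisplayCharForBuffer is a hypothetical helper name, not code from the commit:

// Hypothetical helper capturing the display rule used in both hunks above.
char DisplayCharForBuffer(int i, int first_time_used, int last_time_used) {
  return (first_time_used == last_time_used) ? '*' : GetOrdinalCharacter(i);
}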
tensorflow/lite/micro/micro_profiler.cc (19 changes: 14 additions & 5 deletions)
@@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() {
TFLITE_DCHECK(tags_[i] != nullptr);
int position = FindExistingOrNextPosition(tags_[i]);
TFLITE_DCHECK(position >= 0);
total_ticks_per_tag[position].tag = tags_[i];
total_ticks_per_tag[position].ticks =
total_ticks_per_tag[position].ticks + ticks;
total_ticks_per_tag_[position].tag = tags_[i];
total_ticks_per_tag_[position].ticks =
total_ticks_per_tag_[position].ticks + ticks;
total_ticks += ticks;
}

for (int i = 0; i < num_events_; ++i) {
TicksPerTag each_tag_entry = total_ticks_per_tag[i];
TicksPerTag each_tag_entry = total_ticks_per_tag_[i];
if (each_tag_entry.tag == nullptr) {
break;
}
@@ -112,12 +112,21 @@ void MicroProfiler::LogTicksPerTagCsv() {
int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) {
int pos = 0;
for (; pos < num_events_; pos++) {
TicksPerTag each_tag_entry = total_ticks_per_tag[pos];
TicksPerTag each_tag_entry = total_ticks_per_tag_[pos];
if (each_tag_entry.tag == nullptr ||
strcmp(each_tag_entry.tag, tag_name) == 0) {
return pos;
}
}
return pos < num_events_ ? pos : -1;
}

void MicroProfiler::ClearEvents() {
for (int i = 0; i < num_events_; i++) {
total_ticks_per_tag_[i].tag = nullptr;
}

num_events_ = 0;
}

} // namespace tflite
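
With the member renamed to total_ticks_per_tag_, the per-tag totals accumulated by LogTicksPerTagCsv live in the same array that the new out-of-line ClearEvents resets. A minimal usage sketch built only from calls that appear elsewhere in this commit; ProfileOnce is a hypothetical wrapper:

#include "tensorflow/lite/micro/micro_profiler.h"

// Minimal sketch; mirrors how generic_model_benchmark.cc drives the profiler.
void ProfileOnce() {
  tflite::MicroProfiler profiler;

  uint32_t handle = profiler.BeginEvent("Invoke");
  // ... timed work ...
  profiler.EndEvent(handle);

  profiler.Log();                // one line per recorded event
  profiler.LogTicksPerTagCsv();  // ticks aggregated per tag, as CSV
  profiler.ClearEvents();        // clears events and the per-tag totals
}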
tensorflow/lite/micro/micro_profiler.h (6 changes: 3 additions & 3 deletions)
@@ -1,4 +1,4 @@
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface {
virtual void EndEvent(uint32_t event_handle) override;

// Clears all the events that have been currently profiled.
void ClearEvents() { num_events_ = 0; }
void ClearEvents();

// Returns the sum of the ticks taken across all the events. This number
// is only meaningful if all of the events are disjoint (the end time of
@@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface {
// In practice, the number of tags will be much lower than the number of
// events. But it is theoretically possible that each event to be unique and
// hence we allow total_ticks_per_tag to have kMaxEvents entries.
TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};

int FindExistingOrNextPosition(const char* tag_name);

tensorflow/lite/micro/tools/benchmarking/Makefile.inc (9 changes: 9 additions & 0 deletions)
@@ -20,6 +20,15 @@ endif
$(GENERATED_SRCS_DIR)$(GENERIC_BENCHMARK_MODEL_DIR)$(GENERIC_BENCHMARK_MODEL_NAME)_model_data.h
endif

ifeq ($(ENABLE_COMPRESSION), yes)
ifneq ($(GENERIC_BENCHMARK_ALT_MEM_ATTR),)
CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_ATTR=$(GENERIC_BENCHMARK_ALT_MEM_ATTR)
endif
ifneq ($(GENERIC_BENCHMARK_ALT_MEM_SIZE),)
CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_SIZE=$(GENERIC_BENCHMARK_ALT_MEM_SIZE)
endif
endif

GENERIC_BENCHMARK_SRCS := \
$(MICROLITE_BENCHMARK_ROOT_DIR)/generic_model_benchmark.cc \
$(MICROLITE_BENCHMARK_ROOT_DIR)/metrics.cc \
@@ -52,7 +52,7 @@ function substitute_strings() {
IFS=${SAVED_IFS}
replacement=()
for line in "${lines_array[@]}"; do
line=$(sed -e 's/"/\\"/g' <<< "${line}")
line=$(sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' <<< "${line}")
line=$(printf '"%s",\n ' "${line}")
replacement+=( "${line}" )
done
tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc (174 changes: 158 additions & 16 deletions)
@@ -18,6 +18,7 @@ limitations under the License.
#include <sys/types.h>

#include <cstring>
#include <initializer_list>
#include <memory>
#include <random>
#include <type_traits>
@@ -56,19 +57,37 @@ limitations under the License.

#endif // defind(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)

#if defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && \
!defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
#error "GENERIC_BENCHMARK_ALT_MEM_SIZE missing from CXXFLAGS"
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
// !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)

#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
!defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
#error "GENERIC_BENCHMARK_ALT_MEM_ATTR missing from CXXFLAGS"
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
// !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)

#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && defined(USE_TFLM_COMPRESSION)
#define USE_ALT_DECOMPRESSION_MEM
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
// defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
// defined(USE_TFLM_COMPRESSION)

/*
* Generic model benchmark. Evaluates runtime performance of a provided model
* with random inputs.
* Generic model benchmark. Evaluates runtime performance of a provided
* model with random inputs.
*/

namespace tflite {

namespace {

using Profiler = ::tflite::MicroProfiler;

// Seed used for the random input. Input data shouldn't affect invocation timing
// so randomness isn't really needed.
// Seed used for the random input. Input data shouldn't affect invocation
// timing so randomness isn't really needed.
constexpr uint32_t kRandomSeed = 0xFB;

#if !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
@@ -80,6 +99,11 @@ constexpr size_t kTensorArenaSize = GENERIC_BENCHMARK_TENSOR_ARENA_SIZE;
constexpr size_t kTensorArenaSize = 5e6 - MODEL_SIZE;
#endif // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)

#if defined(USE_ALT_DECOMPRESSION_MEM)
constexpr size_t kAltMemorySize = GENERIC_BENCHMARK_ALT_MEM_SIZE;
alignas(16) GENERIC_BENCHMARK_ALT_MEM_ATTR uint8_t g_alt_memory[kAltMemorySize];
#endif // defined(USE_ALT_DECOMPRESSION_MEM)
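
The two macros above arrive via CXXFLAGS from the Makefile.inc hunk earlier in this commit. A hedged illustration of what they might expand to; the section name and size below are purely hypothetical and target-specific:

// Hypothetical example values; normally supplied on the compiler command line as
//   -DGENERIC_BENCHMARK_ALT_MEM_ATTR=... -DGENERIC_BENCHMARK_ALT_MEM_SIZE=...
#define GENERIC_BENCHMARK_ALT_MEM_ATTR __attribute__((section(".ext_ram")))
#define GENERIC_BENCHMARK_ALT_MEM_SIZE (1024 * 1024)
// The declaration above then expands to:
//   alignas(16) __attribute__((section(".ext_ram"))) uint8_t g_alt_memory[1024 * 1024];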

constexpr int kNumResourceVariable = 100;

void SetRandomInput(const uint32_t random_seed,
@@ -130,39 +154,146 @@ bool ReadFile(const char* file_name, void* buffer, size_t buffer_size) {
}
#endif // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)

constexpr uint32_t kCrctabLen = 256;
uint32_t crctab[kCrctabLen];

void GenCRC32Table() {
constexpr uint32_t kPolyN = 0xEDB88320;
for (size_t index = 0; index < kCrctabLen; index++) {
crctab[index] = index;
for (int i = 0; i < 8; i++) {
if (crctab[index] & 1) {
crctab[index] = (crctab[index] >> 1) ^ kPolyN;
} else {
crctab[index] >>= 1;
}
}
}
}

uint32_t ComputeCRC32(const uint8_t* data, const size_t data_length) {
uint32_t crc32 = ~0U;

for (size_t i = 0; i < data_length; i++) {
// crctab is an array of 256 32-bit constants
const uint32_t index = (crc32 ^ data[i]) & (kCrctabLen - 1);
crc32 = (crc32 >> 8) ^ crctab[index];
}

// invert all bits of result
crc32 ^= ~0U;
return crc32;
}

void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
GenCRC32Table();
for (size_t i = 0; i < interpreter->outputs_size(); ++i) {
TfLiteTensor* output = interpreter->output_tensor(i);
uint8_t* output_values = tflite::GetTensorData<uint8_t>(output);
uint32_t crc32_value = ComputeCRC32(output_values, output->bytes);
MicroPrintf("Output CRC32: 0x%X", crc32_value);
}
}

void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
GenCRC32Table();
for (size_t i = 0; i < interpreter->inputs_size(); ++i) {
TfLiteTensor* input = interpreter->input_tensor(i);
uint8_t* input_values = tflite::GetTensorData<uint8_t>(input);
uint32_t crc32_value = ComputeCRC32(input_values, input->bytes);
MicroPrintf("Input CRC32: 0x%X", crc32_value);
}
}
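
The CRC-32 above uses the reflected polynomial 0xEDB88320, an all-ones initial value, and a final inversion, i.e. the same parameters as the common zlib-style CRC-32, whose check value for the ASCII string "123456789" is 0xCBF43926. A small self-test sketch, not part of the commit; the function name is hypothetical:

// Sketch: sanity-check ComputeCRC32 against the standard CRC-32 check value.
void CheckCRC32Implementation() {
  GenCRC32Table();
  const uint8_t kCheck[] = {'1', '2', '3', '4', '5', '6', '7', '8', '9'};
  const uint32_t crc = ComputeCRC32(kCheck, sizeof(kCheck));
  MicroPrintf("CRC32 self-test: 0x%X (expected 0xCBF43926)", crc);
}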

int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
Profiler profiler;
static Profiler profiler;
static Profiler profiler2;
TfLiteStatus status;

// use this to keep the application size stable regardless of whether
// compression is being used
#ifdef USE_TFLM_COMPRESSION
constexpr bool using_compression = true;
#else // USE_TFLM_COMPRESSION
constexpr bool using_compression = false;
#endif // USE_TFLM_COMPRESSION

alignas(16) static uint8_t tensor_arena[kTensorArenaSize];

uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
#ifdef USE_ALT_DECOMPRESSION_MEM
std::initializer_list<tflite::MicroContext::AlternateMemoryRegion>
alt_memory_region = {{g_alt_memory, kAltMemorySize}};
#endif // USE_ALT_DECOMPRESSION_MEM

uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
const tflite::Model* model = tflite::GetModel(model_data);
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
TflmOpResolver op_resolver;
TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
status = CreateOpResolver(op_resolver);
if (status != kTfLiteOk) {
MicroPrintf("tflite::CreateOpResolver failed");
return -1;
}
profiler.EndEvent(event_handle);

event_handle = profiler.BeginEvent("tflite::RecordingMicroAllocator::Create");
tflite::RecordingMicroAllocator* allocator(
tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize));
profiler.EndEvent(event_handle);
event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation");
tflite::RecordingMicroInterpreter interpreter(
model, op_resolver, allocator,
tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable),
&profiler);
TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors());
profiler.EndEvent(event_handle);

#ifdef USE_ALT_DECOMPRESSION_MEM
event_handle =
profiler.BeginEvent("tflite::MicroInterpreter::SetDecompressionMemory");
status = interpreter.SetDecompressionMemory(alt_memory_region);
if (status != kTfLiteOk) {
MicroPrintf("tflite::MicroInterpreter::SetDecompressionMemory failed");
return -1;
}
profiler.EndEvent(event_handle);
#endif // USE_ALT_DECOMPRESSION_MEM

event_handle =
profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors");
status = interpreter.AllocateTensors();
if (status != kTfLiteOk) {
MicroPrintf("tflite::MicroInterpreter::AllocateTensors failed");
return -1;
}
profiler.EndEvent(event_handle);

profiler.Log();
profiler.LogTicksPerTagCsv();
profiler.ClearEvents();

if (using_compression) {
status = interpreter.SetAlternateProfiler(&profiler2);
if (status != kTfLiteOk) {
MicroPrintf("tflite::MicroInterpreter::SetAlternateProfiler failed");
return -1;
}
}

MicroPrintf(""); // null MicroPrintf serves as a newline.

// For streaming models, the interpreter will return kTfLiteAbort if the model
// does not yet have enough data to make an inference. As such, we need to
// invoke the interpreter multiple times until we either receive an error or
// kTfLiteOk. This loop also works for non-streaming models, as they'll just
// return kTfLiteOk after the first invocation.
// For streaming models, the interpreter will return kTfLiteAbort if the
// model does not yet have enough data to make an inference. As such, we
// need to invoke the interpreter multiple times until we either receive an
// error or kTfLiteOk. This loop also works for non-streaming models, as
// they'll just return kTfLiteOk after the first invocation.
uint32_t seed = kRandomSeed;
while (true) {
SetRandomInput(seed++, interpreter);
TfLiteStatus status = interpreter.Invoke();
ShowInputCRC32(&interpreter);
MicroPrintf(""); // null MicroPrintf serves as a newline.

status = interpreter.Invoke();
if ((status != kTfLiteOk) && (static_cast<int>(status) != kTfLiteAbort)) {
MicroPrintf("Model interpreter invocation failed: %d\n", status);
return -1;
@@ -174,6 +305,17 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
MicroPrintf(""); // null MicroPrintf serves as a newline.
profiler.ClearEvents();

if (using_compression) {
profiler2.Log();
MicroPrintf(""); // null MicroPrintf serves as a newline.
profiler2.LogTicksPerTagCsv();
MicroPrintf(""); // null MicroPrintf serves as a newline.
profiler2.ClearEvents();
}

ShowOutputCRC32(&interpreter);
MicroPrintf(""); // null MicroPrintf serves as a newline.

if (status == kTfLiteOk) {
break;
}