Qualcomm AI Engine Direct - enable loading context binary directly (#4163)

Summary:
- add utilities for loading context binaries generated by QNN tools (see the sketch after this list)
- align environment variable naming with QNN
- fix a bug in online prepare and extend coverage to support bitwidth quantization
- add a llama7b end-to-end example from Qualcomm AI Hub
- minor fixes for style & typos
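For orientation, a minimal C++ sketch of the context-binary path this commit enables, assuming the binary was already produced by QNN tools and read from disk; QnnManager, QnnExecuTorchContextBinary, Init, and AllocateTensor appear in this commit, while the options pointer, file path, and reading helper are stand-ins:

// Hedged sketch, not the commit's exact flow: hand a QNN-tools context
// binary to QnnManager instead of compiling a graph ahead of time.
#include <fstream>
#include <iterator>
#include <memory>
#include <vector>

std::vector<char> ReadAll(const char* path) {  // stand-in helper
  std::ifstream f(path, std::ios::binary);
  return {std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>()};
}

void LoadPrebuiltContext(const QnnExecuTorchOptions* options) {
  std::vector<char> blob = ReadAll("model_ctx.bin");  // hypothetical path

  QnnExecuTorchContextBinary ctx_bin;
  ctx_bin.buffer = blob.data();
  ctx_bin.nbytes = blob.size();

  auto manager = std::make_shared<QnnManager>(options, ctx_bin);
  if (manager->Init() == Error::Ok) {  // Error from the ExecuTorch runtime
    manager->AllocateTensor();         // binds graph inputs/outputs
  }
  // `blob` must outlive the manager: the binary is borrowed, not copied.
}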

Pull Request resolved: #4163

Reviewed By: swolchok, kirklandsign

Differential Revision: D59737140

Pulled By: cccclai

fbshipit-source-id: 16e98d7f5da7204a2d04258fd75dabd8aa1eaa7d
haowhsu-quic authored and facebook-github-bot committed Jul 17, 2024
1 parent c3357e1 commit 1b0bf1c
Showing 30 changed files with 2,096 additions and 164 deletions.
3 changes: 3 additions & 0 deletions backends/qualcomm/CMakeLists.txt
@@ -235,6 +235,9 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
set_target_properties(
PyQnnManagerAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
)
set_target_properties(
PyQnnWrapperAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
)

target_link_libraries(
PyQnnManagerAdaptor
10 changes: 7 additions & 3 deletions backends/qualcomm/aot/ir/qcir.fbs
@@ -52,9 +52,8 @@ enum QuantizeDef : byte {
enum QuantizeType : byte {
SCALE_OFFSET = 0,
AXIS_SCALE_OFFSET,
// TODO: enable
// QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET
// QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
BW_SCALE_OFFSET,
BW_AXIS_SCALE_OFFSET,
UNDEFINED,
}

@@ -66,7 +65,12 @@ struct ScaleOffset {
table QuantizeParam {
def: QuantizeDef;
type: QuantizeType;
bitwidth: uint;
axis: int;
// used by bitwidth quantization
scales: [float];
offsets: [int];
// used by general quantization
data: [ScaleOffset];
}

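To make the new fields concrete, a hedged sketch of serializing a 4-bit per-channel parameter with the extended table; the argument order mirrors the CreateQuantizeParamDirect call in qcir_utils.cpp below, while the QuantizeDef member and the scoped-enum naming are assumptions about the generated code:

// Hedged sketch: a BW_AXIS_SCALE_OFFSET parameter carries bitwidth plus
// flat scales/offsets arrays; the general encodings fill `data` instead.
flatbuffers::FlatBufferBuilder builder;
std::vector<float> scales{0.05f, 0.07f};  // one entry per channel
std::vector<int32_t> offsets{-8, -8};
std::vector<qcir::ScaleOffset> data;      // unused by bitwidth encodings

auto qparam = qcir::CreateQuantizeParamDirect(
    builder,
    qcir::QuantizeDef::DEFINED,           // assumed enum member
    qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
    /*bitwidth=*/4,
    /*axis=*/0,
    &scales,
    &offsets,
    &data);
builder.Finish(qparam);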
66 changes: 58 additions & 8 deletions backends/qualcomm/aot/ir/qcir_utils.cpp
@@ -55,11 +55,11 @@ qcir::DataType ToDataType(Qnn_DataType_t type) {
{QNN_DATATYPE_FLOAT_16, qcir::DataType::FLOAT16},
{QNN_DATATYPE_FLOAT_32, qcir::DataType::FLOAT32},
// {QNN_DATATYPE_FLOAT_64, qcir::DataType::FLOAT64},
// {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
{QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
{QNN_DATATYPE_SFIXED_POINT_8, qcir::DataType::SFIXED8},
{QNN_DATATYPE_SFIXED_POINT_16, qcir::DataType::SFIXED16},
{QNN_DATATYPE_SFIXED_POINT_32, qcir::DataType::SFIXED32},
// {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
{QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
{QNN_DATATYPE_UFIXED_POINT_8, qcir::DataType::UFIXED8},
{QNN_DATATYPE_UFIXED_POINT_16, qcir::DataType::UFIXED16},
{QNN_DATATYPE_UFIXED_POINT_32, qcir::DataType::UFIXED32},
@@ -84,11 +84,11 @@ Qnn_DataType_t ToDataType(qcir::DataType type) {
{qcir::DataType::FLOAT16, QNN_DATATYPE_FLOAT_16},
{qcir::DataType::FLOAT32, QNN_DATATYPE_FLOAT_32},
// {qcir::DataType::FLOAT64, QNN_DATATYPE_FLOAT_64},
// {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
{qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
{qcir::DataType::SFIXED8, QNN_DATATYPE_SFIXED_POINT_8},
{qcir::DataType::SFIXED16, QNN_DATATYPE_SFIXED_POINT_16},
{qcir::DataType::SFIXED32, QNN_DATATYPE_SFIXED_POINT_32},
// {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
{qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
{qcir::DataType::UFIXED8, QNN_DATATYPE_UFIXED_POINT_8},
{qcir::DataType::UFIXED16, QNN_DATATYPE_UFIXED_POINT_16},
{qcir::DataType::UFIXED32, QNN_DATATYPE_UFIXED_POINT_32},
@@ -114,13 +114,20 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
qcir::QuantizeType::SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET,
qcir::QuantizeType::AXIS_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET,
qcir::QuantizeType::BW_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET,
qcir::QuantizeType::BW_AXIS_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_UNDEFINED,
qcir::QuantizeType::UNDEFINED},
};

int axis = 0;
int32_t axis = 0;
uint32_t bitwidth = 0;
auto quant_type = type_map.at(param.quantizationEncoding);
std::vector<qcir::ScaleOffset> data;
std::vector<float> scales;
std::vector<int32_t> offsets;
switch (quant_type) {
case qcir::QuantizeType::SCALE_OFFSET: {
data.emplace_back(qcir::ScaleOffset(
@@ -129,17 +136,42 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
case qcir::QuantizeType::AXIS_SCALE_OFFSET: {
size_t len = param.axisScaleOffsetEncoding.numScaleOffsets;
axis = param.axisScaleOffsetEncoding.axis;
data.reserve(len);
for (uint i = 0; i < len; ++i) {
data.emplace_back(qcir::ScaleOffset(
param.axisScaleOffsetEncoding.scaleOffset[i].scale,
param.axisScaleOffsetEncoding.scaleOffset[i].offset));
}
} break;
case qcir::QuantizeType::BW_SCALE_OFFSET: {
bitwidth = param.bwScaleOffsetEncoding.bitwidth;
scales.push_back(param.bwScaleOffsetEncoding.scale);
offsets.push_back(param.bwScaleOffsetEncoding.offset);
} break;
case qcir::QuantizeType::BW_AXIS_SCALE_OFFSET: {
bitwidth = param.bwAxisScaleOffsetEncoding.bitwidth;
axis = param.bwAxisScaleOffsetEncoding.axis;
size_t len = param.bwAxisScaleOffsetEncoding.numElements;
scales.reserve(len);
offsets.reserve(len);
for (size_t i = 0; i < len; ++i) {
scales.push_back(param.bwAxisScaleOffsetEncoding.scales[i]);
offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]);
}
} break;
default:
QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected");
break;
}
return CreateQuantizeParamDirect(
*builder, def_map.at(param.encodingDefinition), quant_type, axis, &data);
*builder,
def_map.at(param.encodingDefinition),
quant_type,
bitwidth,
axis,
&scales,
&offsets,
&data);
}

Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
@@ -155,6 +187,10 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
QNN_QUANTIZATION_ENCODING_SCALE_OFFSET},
{qcir::QuantizeType::AXIS_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET},
{qcir::QuantizeType::BW_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET},
{qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET},
{qcir::QuantizeType::UNDEFINED,
QNN_QUANTIZATION_ENCODING_UNDEFINED},
};
@@ -174,7 +210,22 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
reinterpret_cast<Qnn_ScaleOffset_t*>(
const_cast<uint8_t*>(param->data()->Data()));
} break;
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
p.bwScaleOffsetEncoding.bitwidth = param->bitwidth();
p.bwScaleOffsetEncoding.scale = param->scales()->Get(0);
p.bwScaleOffsetEncoding.offset = param->offsets()->Get(0);
} break;
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
p.bwAxisScaleOffsetEncoding.axis = param->axis();
p.bwAxisScaleOffsetEncoding.numElements = param->scales()->size();
p.bwAxisScaleOffsetEncoding.scales =
const_cast<float*>(param->scales()->data());
p.bwAxisScaleOffsetEncoding.offsets =
const_cast<int32_t*>(param->offsets()->data());
} break;
default:
QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected");
break;
}
return p;
@@ -212,8 +263,7 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) {
QNN_VER_PTR(t)->dataType = ToDataType(tensor->dtype());
QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor->qparam());
QNN_VER_PTR(t)->rank = tensor->shape()->size();
QNN_VER_PTR(t)->dimensions = reinterpret_cast<uint32_t*>(
const_cast<uint8_t*>(tensor->shape()->Data()));
QNN_VER_PTR(t)->dimensions = const_cast<uint32_t*>(tensor->shape()->data());
QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size();
QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type)
? nullptr
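With both directions of ToQuantizeParam now covering the bitwidth encodings, a round trip should preserve them. A hedged sketch; the forward converter's argument order, the qparam_type alias, and the root-access step are assumptions based on the calls above:

// Hedged sketch: QNN -> qcir -> QNN round trip for BW_SCALE_OFFSET. The
// single scale/offset pair travels as one-element vectors and is read
// back with Get(0) by the reverse converter above.
Qnn_QuantizeParams_t src = QNN_QUANTIZE_PARAMS_INIT;
src.encodingDefinition = QNN_DEFINITION_DEFINED;
src.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET;
src.bwScaleOffsetEncoding.bitwidth = 4;
src.bwScaleOffsetEncoding.scale = 0.05f;
src.bwScaleOffsetEncoding.offset = -8;

flatbuffers::FlatBufferBuilder builder;
builder.Finish(ToQuantizeParam(src, &builder));  // argument order assumed
const auto* packed =
    flatbuffers::GetRoot<qcir::QuantizeParam>(builder.GetBufferPointer());
Qnn_QuantizeParams_t dst = ToQuantizeParam(packed);  // qparam_type assumed
// dst.bwScaleOffsetEncoding should now mirror src.bwScaleOffsetEncoding.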
6 changes: 5 additions & 1 deletion backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -26,12 +26,16 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {

py::class_<PyQnnManager, std::shared_ptr<PyQnnManager>>(m, "QnnManager")
.def(py::init<const py::bytes&>())
.def(py::init<const py::bytes&, const py::bytes&>())
.def("Init", &PyQnnManager::Init)
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
.def("Compile", &PyQnnManager::Compile)
.def("Destroy", &PyQnnManager::Destroy)
.def("IsAvailable", &PyQnnManager::IsAvailable)
.def("IsTensorDump", &PyQnnManager::IsTensorDump);
.def("IsTensorDump", &PyQnnManager::IsTensorDump)
.def("AllocateTensor", &PyQnnManager::AllocateTensor)
.def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
.def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
}
} // namespace qnn
} // namespace executor
36 changes: 36 additions & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -7,6 +7,7 @@
*/
#pragma once
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
#include <executorch/backends/qualcomm/runtime/Logging.h>
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
@@ -23,6 +24,7 @@ namespace executor {
namespace qnn {
class PyQnnManager {
public:
// used for AoT compilation
explicit PyQnnManager(const py::bytes& buffer)
: qnn_executorch_option_ptr_(buffer),
qnn_executorch_context_binary_(QNN_EXECUTORCH_CONTEXT_BINARY) {
@@ -33,6 +35,18 @@ class PyQnnManager {
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}
// used for loading context binary directly
explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
: qnn_executorch_option_ptr_(buffer) {
auto qnn_executorch_options = GetQnnExecuTorchOptions(
qnn_executorch_option_ptr_.cast<std::string_view>().data());

py::buffer_info info(py::buffer(ctx_bin).request());
qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}

Error Init() {
return qnn_manager_->Init();
@@ -141,6 +155,28 @@ class PyQnnManager {
return qnn_manager_->IsTensorDump();
}

Error AllocateTensor() {
return qnn_manager_->AllocateTensor();
}

py::list GetGraphInputs() {
py::list ret;
for (const std::shared_ptr<TensorWrapper>& input :
qnn_manager_->GetGraphInputs()) {
ret.append(PyQnnTensorWrapper(input));
}
return ret;
}

py::list GetGraphOutputs() {
py::list ret;
for (const std::shared_ptr<TensorWrapper>& output :
qnn_manager_->GetGraphOutputs()) {
ret.append(PyQnnTensorWrapper(output));
}
return ret;
}

private:
// Store the bytes object instead of a raw pointer so that this module will
// keep the bytes alive.
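The two-argument constructor above borrows the context binary through pybind11's buffer protocol rather than copying it. A self-contained sketch of that pattern, with hypothetical module and function names; the key point is that the caller must keep the bytes object alive while the borrowed pointer is in use:

// Hedged sketch of the borrow-don't-copy pattern used by the new
// constructor: py::buffer_info exposes the raw pointer and size of a
// py::bytes object without duplicating the data.
#include <cstdint>
#include <pybind11/pybind11.h>
namespace py = pybind11;

PYBIND11_MODULE(buffer_demo, m) {  // hypothetical module
  m.def("view_of", [](const py::bytes& b) {
    py::buffer_info info(py::buffer(b).request());
    const void* ptr = info.ptr;                     // borrowed view
    const auto nbytes = info.size * info.itemsize;  // itemsize == 1 here
    return py::make_tuple(reinterpret_cast<std::uintptr_t>(ptr), nbytes);
  });
}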
14 changes: 14 additions & 0 deletions backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp
@@ -104,6 +104,8 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
}

PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
PYBIND11_NUMPY_DTYPE(PyQnnTensorWrapper::EncodingData, scale, offset);

py::enum_<Qnn_TensorType_t>(m, "Qnn_TensorType_t")
.value(
"QNN_TENSOR_TYPE_APP_WRITE",
@@ -234,6 +236,18 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
"GetOpWrapper",
&PyQnnOpWrapper::GetOpWrapper,
"A function which get op wrapper");

py::class_<PyQnnTensorWrapper::Encoding>(m, "Encoding")
.def_readonly("data", &PyQnnTensorWrapper::Encoding::data)
.def_readonly("axis", &PyQnnTensorWrapper::Encoding::axis);

py::class_<PyQnnTensorWrapper, std::shared_ptr<PyQnnTensorWrapper>>(
m, "PyQnnTensorWrapper")
.def(py::init<const std::shared_ptr<TensorWrapper>&>())
.def("GetDims", &PyQnnTensorWrapper::GetDims)
.def("GetDataType", &PyQnnTensorWrapper::GetDataType)
.def("GetName", &PyQnnTensorWrapper::GetName)
.def("GetEncodings", &PyQnnTensorWrapper::GetEncodings);
}
} // namespace qnn
} // namespace executor
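The PYBIND11_NUMPY_DTYPE registration above is what lets GetEncodings hand scale/offset pairs to Python as one structured NumPy array. A standalone sketch of the mechanism, with hypothetical names:

// Hedged sketch: registering a POD struct as a structured dtype so that
// py::array_t<Pair> surfaces in Python as an array with named fields.
#include <cstdint>
#include <pybind11/numpy.h>
namespace py = pybind11;

struct Pair {
  float scale;
  int32_t offset;
};

PYBIND11_MODULE(dtype_demo, m) {  // hypothetical module
  PYBIND11_NUMPY_DTYPE(Pair, scale, offset);
  m.def("make", []() {
    py::array_t<Pair> arr(2);
    auto view = arr.mutable_unchecked<1>();
    view(0) = {0.05f, -8};
    view(1) = {0.07f, -8};
    return arr;  // Python side: arr["scale"], arr["offset"]
  });
}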
88 changes: 87 additions & 1 deletion backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h
@@ -85,7 +85,8 @@ class PyQnnOpWrapper {
name, data_type, attrData["data"].cast<bool>());
break;
default:
QNN_EXECUTORCH_LOG_ERROR("tensor.v1.name: %d", data_type);
QNN_EXECUTORCH_LOG_ERROR(
"%s has invalid data type: %d", name.c_str(), data_type);
break;
}
}
@@ -96,6 +97,91 @@ class PyQnnOpWrapper {
private:
std::shared_ptr<OpWrapper> op_wrapper_;
};

class PyQnnTensorWrapper {
public:
explicit PyQnnTensorWrapper(const std::shared_ptr<TensorWrapper>& wrapper) {
tensor_wrapper_ = wrapper;
}
struct EncodingData {
float scale;
int32_t offset;
};
struct Encoding {
py::array_t<EncodingData> data;
int32_t axis;
};

py::array_t<std::uint32_t> GetDims() {
std::uint32_t* dim = tensor_wrapper_->GetDims();
size_t shape[1]{tensor_wrapper_->GetRank()};
size_t stride[1]{sizeof(std::uint32_t)};
auto ret = py::array_t<std::uint32_t>(shape, stride);
auto view = ret.mutable_unchecked<1>();
for (int i = 0; i < ret.shape(0); ++i) {
view(i) = dim[i];
}
return ret;
}
std::string GetName() {
return tensor_wrapper_->GetName();
}
Qnn_DataType_t GetDataType() {
return tensor_wrapper_->GetDataType();
}
Encoding GetEncodings() {
auto q_param = tensor_wrapper_->GetQuantizeParams();
size_t stride[1]{sizeof(EncodingData)};

switch (q_param.quantizationEncoding) {
case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: {
Qnn_ScaleOffset_t data = q_param.scaleOffsetEncoding;
size_t shape[1]{1};
auto enc_data = py::array_t<EncodingData>(shape, stride);
auto view = enc_data.mutable_unchecked<1>();
view(0) = {data.scale, data.offset};
return {enc_data, -1};
}
case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
Qnn_AxisScaleOffset_t data = q_param.axisScaleOffsetEncoding;
size_t shape[1]{data.numScaleOffsets};
auto enc_data = py::array_t<EncodingData>(shape, stride);
auto view = enc_data.mutable_unchecked<1>();
for (int i = 0; i < enc_data.shape(0); ++i) {
view(i) = {data.scaleOffset[i].scale, data.scaleOffset[i].offset};
}
return {enc_data, data.axis};
}
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
Qnn_BwScaleOffset_t data = q_param.bwScaleOffsetEncoding;
size_t shape[1]{1};
auto enc_data = py::array_t<EncodingData>(shape, stride);
auto view = enc_data.mutable_unchecked<1>();
view(0) = {data.scale, data.offset};
return {enc_data, -1};
}
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
Qnn_BwAxisScaleOffset_t data = q_param.bwAxisScaleOffsetEncoding;
size_t shape[1]{data.numElements};
auto enc_data = py::array_t<EncodingData>(shape, stride);
auto view = enc_data.mutable_unchecked<1>();
for (int i = 0; i < enc_data.shape(0); ++i) {
view(i) = {data.scales[i], data.offsets[i]};
}
return {enc_data, data.axis};
}
default:
QNN_EXECUTORCH_LOG_ERROR(
"%s QNN_QUANTIZATION_ENCODING_UNDEFINED detected",
GetName().c_str());
break;
}
return {};
}

private:
std::shared_ptr<TensorWrapper> tensor_wrapper_;
};
} // namespace qnn
} // namespace executor
} // namespace torch
6 changes: 5 additions & 1 deletion backends/qualcomm/aot/wrappers/TensorWrapper.h
@@ -75,7 +75,11 @@ class TensorWrapper {
return QNN_VER_PTR(tensor_)->memType;
};

std::string GetName() const {
Qnn_QuantizeParams_t GetQuantizeParams() const {
return QNN_VER_PTR(tensor_)->quantizeParams;
}

const std::string& GetName() const {
return qnn_tensor_name_;
};

