From 1b0bf1c3e8cd8c7893fbc2e123b740909b1f1342 Mon Sep 17 00:00:00 2001
From: haowhsu-quic
Date: Wed, 17 Jul 2024 15:02:14 -0700
Subject: [PATCH] Qualcomm AI Engine Direct - enable loading context binary directly (#4163)

Summary:
- add utilities for loading context binary generated from qnn tools
- align env variable naming with qnn
- fix bug in online prepare and extend coverage to support bitwidth quantization
- llama7b e2e example from qualcomm ai_hub
- minor fixes for style & typo

Pull Request resolved: https://github.com/pytorch/executorch/pull/4163

Reviewed By: swolchok, kirklandsign

Differential Revision: D59737140

Pulled By: cccclai

fbshipit-source-id: 16e98d7f5da7204a2d04258fd75dabd8aa1eaa7d
---
 backends/qualcomm/CMakeLists.txt | 3 +
 backends/qualcomm/aot/ir/qcir.fbs | 10 +-
 backends/qualcomm/aot/ir/qcir_utils.cpp | 66 ++-
 .../aot/python/PyQnnManagerAdaptor.cpp | 6 +-
 .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 36 ++
 .../aot/python/PyQnnWrapperAdaptor.cpp | 14 +
 .../qualcomm/aot/python/PyQnnWrapperAdaptor.h | 88 +++-
 .../qualcomm/aot/wrappers/TensorWrapper.h | 6 +-
 backends/qualcomm/builders/qnn_constants.py | 6 +
 backends/qualcomm/qnn_preprocess.py | 23 +-
 .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 3 +-
 backends/qualcomm/runtime/QnnManager.cpp | 13 +-
 .../runtime/backends/QnnBackendCache.cpp | 26 +-
 .../runtime/backends/QnnGraphCommon.cpp | 2 +-
 backends/qualcomm/scripts/build.sh | 12 +-
 backends/qualcomm/setup.md | 6 +-
 backends/qualcomm/tests/models.py | 13 +
 backends/qualcomm/tests/test_qnn_delegate.py | 68 ++-
 backends/qualcomm/tests/utils.py | 82 ++-
 backends/qualcomm/utils/utils.py | 161 +++++-
 examples/qualcomm/CMakeLists.txt | 101 ++--
 .../qnn_qaihub_llama_runner.cpp | 84 +++
 examples/qualcomm/llama2/README.md | 43 +-
 examples/qualcomm/llama2/llama_qaihub.py | 235 +++++++++
 .../llama2/qaihub_runner/io_memory.cpp | 481 ++++++++++++++++++
 .../qualcomm/llama2/qaihub_runner/io_memory.h | 149 ++++++
 .../qualcomm/llama2/qaihub_runner/runner.cpp | 359 +++++++++++++
 .../qualcomm/llama2/qaihub_runner/runner.h | 106 ++++
 examples/qualcomm/llama2/runner/runner.cpp | 4 +-
 examples/qualcomm/scripts/utils.py | 54 +-
 30 files changed, 2096 insertions(+), 164 deletions(-)
 create mode 100644 examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp
 create mode 100644 examples/qualcomm/llama2/llama_qaihub.py
 create mode 100644 examples/qualcomm/llama2/qaihub_runner/io_memory.cpp
 create mode 100644 examples/qualcomm/llama2/qaihub_runner/io_memory.h
 create mode 100644 examples/qualcomm/llama2/qaihub_runner/runner.cpp
 create mode 100644 examples/qualcomm/llama2/qaihub_runner/runner.h

diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index a840ab0bb9..e5f365d3ea 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -235,6 +235,9 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
   set_target_properties(
     PyQnnManagerAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
   )
+  set_target_properties(
+    PyQnnWrapperAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
+  )
   target_link_libraries(
     PyQnnManagerAdaptor
diff --git a/backends/qualcomm/aot/ir/qcir.fbs b/backends/qualcomm/aot/ir/qcir.fbs
index b591624972..2d8b1f78fe 100755
--- a/backends/qualcomm/aot/ir/qcir.fbs
+++ b/backends/qualcomm/aot/ir/qcir.fbs
@@ -52,9 +52,8 @@ enum QuantizeDef : byte {
 enum QuantizeType : byte {
   SCALE_OFFSET = 0,
   AXIS_SCALE_OFFSET,
-  // TODO: enable
-  // QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET
-  // QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
+
BW_SCALE_OFFSET, + BW_AXIS_SCALE_OFFSET, UNDEFINED, } @@ -66,7 +65,12 @@ struct ScaleOffset { table QuantizeParam { def: QuantizeDef; type: QuantizeType; + bitwidth: uint; axis: int; + // used by bitwidth quantization + scales: [float]; + offsets: [int]; + // used by general quantization data: [ScaleOffset]; } diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp index d32c36149f..e025b8667a 100755 --- a/backends/qualcomm/aot/ir/qcir_utils.cpp +++ b/backends/qualcomm/aot/ir/qcir_utils.cpp @@ -55,11 +55,11 @@ qcir::DataType ToDataType(Qnn_DataType_t type) { {QNN_DATATYPE_FLOAT_16, qcir::DataType::FLOAT16}, {QNN_DATATYPE_FLOAT_32, qcir::DataType::FLOAT32}, // {QNN_DATATYPE_FLOAT_64, qcir::DataType::FLOAT64}, - // {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4}, + {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4}, {QNN_DATATYPE_SFIXED_POINT_8, qcir::DataType::SFIXED8}, {QNN_DATATYPE_SFIXED_POINT_16, qcir::DataType::SFIXED16}, {QNN_DATATYPE_SFIXED_POINT_32, qcir::DataType::SFIXED32}, - // {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4}, + {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4}, {QNN_DATATYPE_UFIXED_POINT_8, qcir::DataType::UFIXED8}, {QNN_DATATYPE_UFIXED_POINT_16, qcir::DataType::UFIXED16}, {QNN_DATATYPE_UFIXED_POINT_32, qcir::DataType::UFIXED32}, @@ -84,11 +84,11 @@ Qnn_DataType_t ToDataType(qcir::DataType type) { {qcir::DataType::FLOAT16, QNN_DATATYPE_FLOAT_16}, {qcir::DataType::FLOAT32, QNN_DATATYPE_FLOAT_32}, // {qcir::DataType::FLOAT64, QNN_DATATYPE_FLOAT_64}, - // {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4}, + {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4}, {qcir::DataType::SFIXED8, QNN_DATATYPE_SFIXED_POINT_8}, {qcir::DataType::SFIXED16, QNN_DATATYPE_SFIXED_POINT_16}, {qcir::DataType::SFIXED32, QNN_DATATYPE_SFIXED_POINT_32}, - // {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4}, + {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4}, {qcir::DataType::UFIXED8, QNN_DATATYPE_UFIXED_POINT_8}, {qcir::DataType::UFIXED16, QNN_DATATYPE_UFIXED_POINT_16}, {qcir::DataType::UFIXED32, QNN_DATATYPE_UFIXED_POINT_32}, @@ -114,13 +114,20 @@ flatbuffers::Offset ToQuantizeParam( qcir::QuantizeType::SCALE_OFFSET}, {QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET, qcir::QuantizeType::AXIS_SCALE_OFFSET}, + {QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET, + qcir::QuantizeType::BW_SCALE_OFFSET}, + {QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET, + qcir::QuantizeType::BW_AXIS_SCALE_OFFSET}, {QNN_QUANTIZATION_ENCODING_UNDEFINED, qcir::QuantizeType::UNDEFINED}, }; - int axis = 0; + int32_t axis = 0; + uint32_t bitwidth = 0; auto quant_type = type_map.at(param.quantizationEncoding); std::vector data; + std::vector scales; + std::vector offsets; switch (quant_type) { case qcir::QuantizeType::SCALE_OFFSET: { data.emplace_back(qcir::ScaleOffset( @@ -129,17 +136,42 @@ flatbuffers::Offset ToQuantizeParam( case qcir::QuantizeType::AXIS_SCALE_OFFSET: { size_t len = param.axisScaleOffsetEncoding.numScaleOffsets; axis = param.axisScaleOffsetEncoding.axis; + data.reserve(len); for (uint i = 0; i < len; ++i) { data.emplace_back(qcir::ScaleOffset( param.axisScaleOffsetEncoding.scaleOffset[i].scale, param.axisScaleOffsetEncoding.scaleOffset[i].offset)); } } break; + case qcir::QuantizeType::BW_SCALE_OFFSET: { + bitwidth = param.bwScaleOffsetEncoding.bitwidth; + scales.push_back(param.bwScaleOffsetEncoding.scale); + offsets.push_back(param.bwScaleOffsetEncoding.offset); + } break; + case qcir::QuantizeType::BW_AXIS_SCALE_OFFSET: { + 
bitwidth = param.bwAxisScaleOffsetEncoding.bitwidth; + axis = param.bwAxisScaleOffsetEncoding.axis; + size_t len = param.bwAxisScaleOffsetEncoding.numElements; + scales.reserve(len); + offsets.reserve(len); + for (size_t i = 0; i < len; ++i) { + scales.push_back(param.bwAxisScaleOffsetEncoding.scales[i]); + offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]); + } + } break; default: + QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected"); break; } return CreateQuantizeParamDirect( - *builder, def_map.at(param.encodingDefinition), quant_type, axis, &data); + *builder, + def_map.at(param.encodingDefinition), + quant_type, + bitwidth, + axis, + &scales, + &offsets, + &data); } Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { @@ -155,6 +187,10 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { QNN_QUANTIZATION_ENCODING_SCALE_OFFSET}, {qcir::QuantizeType::AXIS_SCALE_OFFSET, QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET}, + {qcir::QuantizeType::BW_SCALE_OFFSET, + QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET}, + {qcir::QuantizeType::BW_AXIS_SCALE_OFFSET, + QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET}, {qcir::QuantizeType::UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED}, }; @@ -174,7 +210,22 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) { reinterpret_cast( const_cast(param->data()->Data())); } break; + case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: { + p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth(); + p.bwScaleOffsetEncoding.scale = param->scales()->Get(0); + p.bwScaleOffsetEncoding.offset = param->offsets()->Get(0); + } break; + case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: { + p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth(); + p.bwAxisScaleOffsetEncoding.axis = param->axis(); + p.bwAxisScaleOffsetEncoding.numElements = param->scales()->size(); + p.bwAxisScaleOffsetEncoding.scales = + const_cast(param->scales()->data()); + p.bwAxisScaleOffsetEncoding.offsets = + const_cast(param->offsets()->data()); + } break; default: + QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected"); break; } return p; @@ -212,8 +263,7 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) { QNN_VER_PTR(t)->dataType = ToDataType(tensor->dtype()); QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor->qparam()); QNN_VER_PTR(t)->rank = tensor->shape()->size(); - QNN_VER_PTR(t)->dimensions = reinterpret_cast( - const_cast(tensor->shape()->Data())); + QNN_VER_PTR(t)->dimensions = const_cast(tensor->shape()->data()); QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size(); QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type) ? 
nullptr diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index 77f6a63f62..c785fd0219 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -26,12 +26,16 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { py::class_>(m, "QnnManager") .def(py::init()) + .def(py::init()) .def("Init", &PyQnnManager::Init) .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) .def("Compile", &PyQnnManager::Compile) .def("Destroy", &PyQnnManager::Destroy) .def("IsAvailable", &PyQnnManager::IsAvailable) - .def("IsTensorDump", &PyQnnManager::IsTensorDump); + .def("IsTensorDump", &PyQnnManager::IsTensorDump) + .def("AllocateTensor", &PyQnnManager::AllocateTensor) + .def("GetGraphInputs", &PyQnnManager::GetGraphInputs) + .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs); } } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 5bde58687f..4a675067f3 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -7,6 +7,7 @@ */ #pragma once #include +#include #include #include #include @@ -23,6 +24,7 @@ namespace executor { namespace qnn { class PyQnnManager { public: + // used for AoT compilation explicit PyQnnManager(const py::bytes& buffer) : qnn_executorch_option_ptr_(buffer), qnn_executorch_context_binary_(QNN_EXECUTORCH_CONTEXT_BINARY) { @@ -33,6 +35,18 @@ class PyQnnManager { qnn_manager_ = std::make_shared( qnn_executorch_options, qnn_executorch_context_binary_); } + // used for loading context binary directly + explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin) + : qnn_executorch_option_ptr_(buffer) { + auto qnn_executorch_options = GetQnnExecuTorchOptions( + qnn_executorch_option_ptr_.cast().data()); + + py::buffer_info info(py::buffer(ctx_bin).request()); + qnn_executorch_context_binary_.buffer = static_cast(info.ptr); + qnn_executorch_context_binary_.nbytes = info.size * info.itemsize; + qnn_manager_ = std::make_shared( + qnn_executorch_options, qnn_executorch_context_binary_); + } Error Init() { return qnn_manager_->Init(); @@ -141,6 +155,28 @@ class PyQnnManager { return qnn_manager_->IsTensorDump(); } + Error AllocateTensor() { + return qnn_manager_->AllocateTensor(); + } + + py::list GetGraphInputs() { + py::list ret; + for (const std::shared_ptr& input : + qnn_manager_->GetGraphInputs()) { + ret.append(PyQnnTensorWrapper(input)); + } + return ret; + } + + py::list GetGraphOutputs() { + py::list ret; + for (const std::shared_ptr& output : + qnn_manager_->GetGraphOutputs()) { + ret.append(PyQnnTensorWrapper(output)); + } + return ret; + } + private: // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. 
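The hunk above gives `PyQnnManager` a second constructor that takes a raw QNN context binary alongside the serialized compile options, plus `AllocateTensor`, `GetGraphInputs`, and `GetGraphOutputs` bindings for querying the graph I/O from Python. Below is a minimal sketch of driving these bindings directly, mirroring what `from_context_binary` in `backends/qualcomm/utils/utils.py` does later in this patch; the context-binary path and SoC model are illustrative placeholders, not values prescribed by the patch.

```python
# Minimal sketch (not part of this patch): inspect the I/O of a prebuilt QNN
# context binary through the bindings added above. "model_ctx.bin" and the
# SM8650 SoC model are placeholders.
import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor

from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
    QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    generate_qnn_executorch_option,
)

with open("model_ctx.bin", "rb") as f:  # context binary produced by QNN tools
    ctx_bin = f.read()

# a dummy compile spec is enough here, since nothing is being compiled
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,
    backend_options=generate_htp_compiler_spec(use_fp16=False),
    is_from_context_binary=True,
)
# the second constructor added in this patch: serialized options + raw binary
qnn_mgr = PyQnnManagerAdaptor.QnnManager(
    generate_qnn_executorch_option(compiler_specs), ctx_bin
)
assert qnn_mgr.Init().value == 0, "failed to load context binary"
qnn_mgr.AllocateTensor()
for t in qnn_mgr.GetGraphInputs():
    print(t.GetName(), t.GetDims(), t.GetDataType())
for t in qnn_mgr.GetGraphOutputs():
    enc = t.GetEncodings()
    print(t.GetName(), enc.data["scale"], enc.data["offset"])
qnn_mgr.Destroy()
```

A dummy compile spec suffices because the manager only parses the binary's metadata at this stage; this is how the patch's `from_context_binary` utility recovers tensor names, shapes, dtypes, and quantization encodings before building the graph that mimics the context binary.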
diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp index 43d0b58840..b7287fd5af 100644 --- a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp @@ -104,6 +104,8 @@ std::shared_ptr CreateTensorWrapper( } PYBIND11_MODULE(PyQnnWrapperAdaptor, m) { + PYBIND11_NUMPY_DTYPE(PyQnnTensorWrapper::EncodingData, scale, offset); + py::enum_(m, "Qnn_TensorType_t") .value( "QNN_TENSOR_TYPE_APP_WRITE", @@ -234,6 +236,18 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) { "GetOpWrapper", &PyQnnOpWrapper::GetOpWrapper, "A function which get op wrapper"); + + py::class_(m, "Encoding") + .def_readonly("data", &PyQnnTensorWrapper::Encoding::data) + .def_readonly("axis", &PyQnnTensorWrapper::Encoding::axis); + + py::class_>( + m, "PyQnnTensorWrapper") + .def(py::init&>()) + .def("GetDims", &PyQnnTensorWrapper::GetDims) + .def("GetDataType", &PyQnnTensorWrapper::GetDataType) + .def("GetName", &PyQnnTensorWrapper::GetName) + .def("GetEncodings", &PyQnnTensorWrapper::GetEncodings); } } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h index f80bb67692..f13b5962b7 100644 --- a/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h @@ -85,7 +85,8 @@ class PyQnnOpWrapper { name, data_type, attrData["data"].cast()); break; default: - QNN_EXECUTORCH_LOG_ERROR("tensor.v1.name: %d", data_type); + QNN_EXECUTORCH_LOG_ERROR( + "%s has invalid data type: %d", name, data_type); break; } } @@ -96,6 +97,91 @@ class PyQnnOpWrapper { private: std::shared_ptr op_wrapper_; }; + +class PyQnnTensorWrapper { + public: + explicit PyQnnTensorWrapper(const std::shared_ptr& wrapper) { + tensor_wrapper_ = wrapper; + } + struct EncodingData { + float scale; + int32_t offset; + }; + struct Encoding { + py::array_t data; + int32_t axis; + }; + + py::array_t GetDims() { + std::uint32_t* dim = tensor_wrapper_->GetDims(); + size_t shape[1]{tensor_wrapper_->GetRank()}; + size_t stride[1]{sizeof(std::uint32_t)}; + auto ret = py::array_t(shape, stride); + auto view = ret.mutable_unchecked<1>(); + for (int i = 0; i < ret.shape(0); ++i) { + view(i) = dim[i]; + } + return ret; + } + std::string GetName() { + return tensor_wrapper_->GetName(); + } + Qnn_DataType_t GetDataType() { + return tensor_wrapper_->GetDataType(); + } + Encoding GetEncodings() { + auto q_param = tensor_wrapper_->GetQuantizeParams(); + size_t stride[1]{sizeof(EncodingData)}; + + switch (q_param.quantizationEncoding) { + case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: { + Qnn_ScaleOffset_t data = q_param.scaleOffsetEncoding; + size_t shape[1]{1}; + auto enc_data = py::array_t(shape, stride); + auto view = enc_data.mutable_unchecked<1>(); + view(0) = {data.scale, data.offset}; + return {enc_data, -1}; + } + case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: { + Qnn_AxisScaleOffset_t data = q_param.axisScaleOffsetEncoding; + size_t shape[1]{data.numScaleOffsets}; + auto enc_data = py::array_t(shape, stride); + auto view = enc_data.mutable_unchecked<1>(); + for (int i = 0; i < enc_data.shape(0); ++i) { + view(i) = {data.scaleOffset[i].scale, data.scaleOffset[i].offset}; + } + return {enc_data, data.axis}; + } + case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: { + Qnn_BwScaleOffset_t data = q_param.bwScaleOffsetEncoding; + size_t shape[1]{1}; + auto enc_data = py::array_t(shape, stride); + auto view = 
enc_data.mutable_unchecked<1>(); + view(0) = {data.scale, data.offset}; + return {enc_data, -1}; + } + case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: { + Qnn_BwAxisScaleOffset_t data = q_param.bwAxisScaleOffsetEncoding; + size_t shape[1]{data.numElements}; + auto enc_data = py::array_t(shape, stride); + auto view = enc_data.mutable_unchecked<1>(); + for (int i = 0; i < enc_data.shape(0); ++i) { + view(i) = {data.scales[i], data.offsets[i]}; + } + return {enc_data, data.axis}; + } + default: + QNN_EXECUTORCH_LOG_ERROR( + "%s QNN_QUANTIZATION_ENCODING_UNDEFINED detected", + GetName().c_str()); + break; + } + return {}; + } + + private: + std::shared_ptr tensor_wrapper_; +}; } // namespace qnn } // namespace executor } // namespace torch diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 4aec5f71b7..8db5cfa6a7 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -75,7 +75,11 @@ class TensorWrapper { return QNN_VER_PTR(tensor_)->memType; }; - std::string GetName() const { + Qnn_QuantizeParams_t GetQuantizeParams() const { + return QNN_VER_PTR(tensor_)->quantizeParams; + } + + const std::string& GetName() const { return qnn_tensor_name_; }; diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index f36b0b64c2..dca47ebeec 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -30,6 +30,12 @@ class OpConcat: param_axis: str = "axis" +@dataclass(init=False, frozen=True) +class OpContextLoader: + namespace: str = "qaisw" + meta_ctx_bin: str = "qnn_context_binary" + + @dataclass(init=False, frozen=True) class OpConv2d: op_name: str = "Conv2d" diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index b3979afc58..6360b35280 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -9,8 +9,10 @@ from typing import final, List import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager -from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors +import torch # noqa: F401 +from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors +from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader from executorch.backends.qualcomm.passes.convert_to_linear import ConvertToLinear from executorch.backends.qualcomm.passes.fuse_consecutive_transpose import ( FuseConsecutiveTranspose, @@ -77,9 +79,24 @@ def preprocess( else: py_op_wrapper_list.append(py_op_wrapper) else: - raise RuntimeError( - f"For {node}, {node.op}:{node.target.__name__} is not supported in Qnn Delegate" + err_msg = ( + f"For {node}, {node.op}:{node.target.__name__} " + "is not supported in Qnn Delegate" ) + try: + context_loader_target = eval( + f"torch.ops.{OpContextLoader.namespace}.{node.name}.default", + globals().update(torch.__dict__), + ) + assert node.target == context_loader_target, err_msg + # if graph has context binary loader node, return directly + return PreprocessResult( + processed_bytes=node.meta[OpContextLoader.meta_ctx_bin], + debug_handle_map={}, + ) + except: + raise RuntimeError(err_msg) + elif node.op in [ "get_attr", "placeholder", diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index feccfff9fa..f08f688cf9 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp 
+++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -190,8 +190,9 @@ Error QnnExecuTorchBackend::execute( if (qnn_manager->RegisterMem( args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) != Error::Ok) { + // update data ptr only should be fine input_tensors[i]->FillDataBuffer( - args[i]->toTensor().const_data_ptr(), true /* copy_data */); + args[i]->toTensor().const_data_ptr(), false /* copy_data */); } input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct()); } diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index a77ec1a557..3dc135deb6 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -325,9 +325,16 @@ Error QnnManager::AllocateTensor() { std::sort( input_tensors_.begin(), input_tensors_.end(), CompareExportedInput); } - for (auto& tensor : output_tensors) { - std::shared_ptr tensor_wrapper = CreateTensorWrapper(tensor); - tensor_wrapper->UpdateQnnTensorMeta(tensor); + for (size_t i = 0; i < output_tensors.size(); ++i) { + std::shared_ptr tensor_wrapper = + CreateTensorWrapper(output_tensors[i]); + tensor_wrapper->UpdateQnnTensorMeta(output_tensors[i]); + const std::string& tensor_name = tensor_wrapper->GetName(); + // this is required by identifying shared buffer mechanism + // info might be missed if context binary came from qnn_converter + if (tensor_name.find("output_") == std::string::npos) { + tensor_wrapper->SetName("output_" + tensor_name); + } if (IsTensorDump()) { tensor_wrapper->AllocateDataBuffer(); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 8c7639460f..1ed51ed14f 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -29,7 +29,8 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() { if (error != QNN_SUCCESS) { QNN_EXECUTORCH_LOG_WARN( "Failed to interpret QNN Context " - "binary. Error code %d", + "binary. Error code %d. " + "Try verifying binary with online-prepare format.", QNN_GET_ERROR_CODE(error)); return Error::Internal; } @@ -87,18 +88,6 @@ QnnBackendCache::QnnBackendCache( state_ = SERIALIZE; QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); return; - } else { - // check if context binary came from flatbuffer - flatbuffers::FlatBufferBuilder builder; - flatbuffers::Verifier verifier( - static_cast(qnn_context_blob_.buffer), - qnn_context_blob_.nbytes); - - if (qcir::VerifyGraphBuffer(verifier)) { - state_ = ONLINE_PREPARE; - QNN_EXECUTORCH_LOG_INFO("Verify context blob came from flatbuffer."); - return; - } } if (qnn_sys_impl_.Load() != Error::Ok) { @@ -128,6 +117,17 @@ QnnBackendCache::QnnBackendCache( QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in RESTORE MODE."); Error status = GetQnnGraphInfoFromBinary(); if (status == Error::Internal) { + // check if context binary came from flatbuffer + flatbuffers::FlatBufferBuilder builder; + flatbuffers::Verifier verifier( + static_cast(qnn_context_blob_.buffer), + qnn_context_blob_.nbytes); + + if (qcir::VerifyGraphBuffer(verifier)) { + state_ = ONLINE_PREPARE; + return; + } + QNN_EXECUTORCH_LOG_ERROR( "Failed to parse QNN Graph Info. The cache " "might be broken. 
Please consider to re-generate the " diff --git a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp index 32bafda522..7ef072c28d 100644 --- a/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnGraphCommon.cpp @@ -82,7 +82,7 @@ Error QnnGraph::EnsureTensorInQnnGraph( int name_conflict_count = 0; while (error == QNN_TENSOR_ERROR_NAME_HASH_COLLISION) { - std::string old_name = tensor_wrapper->GetName(); + const std::string& old_name = tensor_wrapper->GetName(); std::string new_name = old_name + "_" + std::to_string(name_conflict_count); diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index b2c8e0d61c..00c4f07a14 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -16,7 +16,7 @@ usage() { echo "Usage: Build the aarch64 version of executor runner or the python interface of Qnn Manager" echo "First, you need to set the environment variable for QNN_SDK_ROOT" echo ", and if you want to build the aarch64 version of executor runner" - echo ", you need to set ANDROID_NDK" + echo ", you need to set ANDROID_NDK_ROOT" echo "e.g.: executorch$ ./backends/qualcomm/scripts/build.sh --skip_x86_64" exit 1 } @@ -56,8 +56,8 @@ done PRJ_ROOT="$( cd "$(dirname "$0")/../../.." ; pwd -P)" if [ "$BUILD_AARCH64" = true ]; then - if [[ -z ${ANDROID_NDK} ]]; then - echo "Please export ANDROID_NDK=/path/to/android_ndk" + if [[ -z ${ANDROID_NDK_ROOT} ]]; then + echo "Please export ANDROID_NDK_ROOT=/path/to/android_ndk" exit -1 fi BUILD_ROOT=$PRJ_ROOT/$CMAKE_AARCH64 @@ -74,11 +74,10 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ - -DBUCK2=$BUCK2 \ -B$BUILD_ROOT cmake --build $BUILD_ROOT -j16 --target install @@ -87,13 +86,12 @@ if [ "$BUILD_AARCH64" = true ]; then CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;" cmake $PRJ_ROOT/$EXAMPLE_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ - -DBUCK2=$BUCK2 \ -B$EXAMPLE_ROOT cmake --build $EXAMPLE_ROOT -j16 diff --git a/backends/qualcomm/setup.md b/backends/qualcomm/setup.md index 2d8aff0ac5..4aca98a3b0 100644 --- a/backends/qualcomm/setup.md +++ b/backends/qualcomm/setup.md @@ -15,7 +15,7 @@ Please finish tutorial [Setting up executorch](../../docs/source/getting-started `$QNN_SDK_ROOT` refers to the root of Qualcomm AI Engine Direct SDK, i.e., the directory containing `QNN_README.txt`. -`$ANDROID_NDK` refers to the root of Android NDK. +`$ANDROID_NDK_ROOT` refers to the root of Android NDK. `$EXECUTORCH_ROOT` refers to the root of executorch git repository. @@ -105,7 +105,7 @@ cmake .. 
\ -DEXECUTORCH_BUILD_SDK=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -B$PWD @@ -113,7 +113,7 @@ cmake .. \ cmake --build $PWD -j16 --target install cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ -DANDROID_ABI='arm64-v8a' \ -DANDROID_NATIVE_API_LEVEL=23 \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 35fcb6bfc6..fe72b1e893 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -189,6 +189,19 @@ def forward(self, x, y): return CompositeReferenceModule(self.modules) +class ContextBinaryExample(torch.nn.Module): + def forward(self, x, y): + x = torch.nn.functional.relu(x) + y = torch.nn.functional.relu(y) + return x, y + + def example_inputs(self): + return { + "x": torch.randn((1, 3, 3, 3)), + "y": torch.randn((2, 1, 5, 5)), + } + + class Conv1dSequential(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 98deb8e11f..35b4ff03d0 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -13,6 +13,7 @@ import torch from executorch.backends.qualcomm.tests.utils import ( + generate_context_binary, QnnPartitioner, QuantDtype, TestQNN, @@ -22,6 +23,7 @@ from executorch.backends.qualcomm.utils.utils import ( canonicalize_program, capture_program, + from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, ) @@ -600,8 +602,6 @@ def test_qnn_backend_16a4w_per_channel_linear(self): ) self.lower_module_and_test_output(module, sample_input) - # Is not enabled in the current qnn sdk release - @unittest.expectedFailure def test_qnn_backend_16a4w_per_channel_linear_with_bias(self): module = Linear() # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -1330,14 +1330,41 @@ def test_qnn_backend_online_prepare(self): TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.arch_table[TestQNN.model], backend_options=backend_options, - debug=False, - saver=False, online_prepare=True, ) module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_context_direct(self): + with tempfile.TemporaryDirectory() as tmp_dir: + module = ContextBinaryExample() # noqa: F405 + generate_context_binary( + module=module, + inputs=module.example_inputs(), + quantized=False, + artifact_dir=tmp_dir, + ) + ctx_path = f"{tmp_dir}/model_ctx.bin" + bundle_program = from_context_binary(ctx_path, "ctx_loader") + backend_options = generate_htp_compiler_spec(use_fp16=True) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + is_from_context_binary=True, + ) + lowered_module = to_backend( + "QnnBackend", bundle_program["edge_program"], compiler_specs + ) + self.verify_output( + module, + tuple( + torch.randn(size=v.shape, dtype=v.dtype) + for v in bundle_program["inputs"].values() + ), + 
lowered_module, + ) + class TestQNNQuantizedUtils(TestQNN): # TODO: refactor to support different backends @@ -1470,8 +1497,6 @@ def test_qnn_backend_online_prepare(self): TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.arch_table[TestQNN.model], backend_options=backend_options, - debug=False, - saver=False, online_prepare=True, ) module = SimpleModel() # noqa: F405 @@ -1479,6 +1504,35 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_context_direct(self): + with tempfile.TemporaryDirectory() as tmp_dir: + module = ContextBinaryExample() # noqa: F405 + generate_context_binary( + module=module, + inputs=module.example_inputs(), + quantized=True, + artifact_dir=tmp_dir, + ) + ctx_path = f"{tmp_dir}/model_ctx.bin" + bundle_program = from_context_binary(ctx_path, "ctx_loader") + backend_options = generate_htp_compiler_spec(use_fp16=False) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + is_from_context_binary=True, + ) + lowered_module = to_backend( + "QnnBackend", bundle_program["edge_program"], compiler_specs + ) + self.verify_output( + module, + tuple( + torch.randn(size=v.shape, dtype=v.dtype) + for v in bundle_program["inputs"].values() + ), + lowered_module, + ) + class TestExampleOssScript(TestQNN): def required_envs(self, conditions=None) -> bool: @@ -2029,7 +2083,7 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.expectedFailure + @unittest.skip("will be enabled after TODOs got resolved") def test_ptq_mobilebert(self): # TODO: 2 approaches to resolve accuracy issue # 1. 
fallback embedding layers: diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 295033e572..476532f48d 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -6,9 +6,10 @@ import collections import copy import os +import subprocess import tempfile import unittest -from typing import Callable, List, Literal, Optional, Tuple +from typing import Callable, Dict, List, Literal, Optional, Tuple import numpy as np import torch @@ -31,6 +32,7 @@ from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass from executorch.exir.program._program import ExecutorchProgram @@ -39,6 +41,71 @@ from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e +def generate_context_binary( + module: torch.nn.Module, + inputs: Dict[str, torch.Tensor], + quantized: bool, + artifact_dir: str, +): + # we also expect clang showing in PATH or context may fail to generate + qnn_sdk = os.environ.get("QNN_SDK_ROOT", None) + ndk = os.environ.get("ANDROID_NDK_ROOT", None) + assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable" + assert ndk, "ANDROID_NDK_ROOT was not found in environment variable" + + inputs_tup = tuple(inputs.values()) + jit_module = torch.jit.trace(module, inputs_tup) + torch.jit.save(jit_module, f"{artifact_dir}/jit_module.pt") + + # input data + if quantized: + input_list = [] + for name, data in inputs.items(): + file_name = f"{artifact_dir}/{name}.raw" + data.detach().numpy().tofile(file_name) + input_list.append(file_name) + + with open(f"{artifact_dir}/input_list.txt", "w") as f: + f.write(" ".join(input_list)) + + # flow of qnn tools + target = "x86_64-linux-clang" + inputs_str = [ + f"-d '{k}' {str(tuple(v.shape)).replace(' ', '')[1:-1]}" + for k, v in inputs.items() + ] + cmds = [ + # setup qnn env + f"source {qnn_sdk}/bin/envsetup.sh;" + # qnn-pytorch-converter + f"{qnn_sdk}/bin/{target}/qnn-pytorch-converter", + f"-i {artifact_dir}/jit_module.pt", + *inputs_str, + f"--input_list {artifact_dir}/input_list.txt" if quantized else "", + "--preserve_io", + f"-o {artifact_dir}/model.cpp;", + # qnn-model-lib-generator + f"{qnn_sdk}/bin/{target}/qnn-model-lib-generator", + f"-c {artifact_dir}/model.cpp", + f"-t {target}", + "-l model", + f"-o {artifact_dir}/model_libs;", + # qnn-context-binary-generator + f"{qnn_sdk}/bin/{target}/qnn-context-binary-generator", + f"--model {artifact_dir}/model_libs/{target}/libmodel.so", + f"--backend {qnn_sdk}/lib/{target}/libQnnHtp.so", + "--binary_file model_ctx", + f"--output_dir {artifact_dir};", + ] + result = subprocess.run( + " ".join(cmds), + shell=True, + executable="/bin/bash", + capture_output=True, + ) + assert os.path.isfile(f"{artifact_dir}/model_ctx.bin"), print(result.stderr) + + class TestQNN(unittest.TestCase): rtol: float = 0 atol: float = 0 @@ -113,18 +180,23 @@ def verify_output( self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], - executorch_prog: ExecutorchProgram, + executorch_prog: ExecutorchProgram | LoweredBackendModule, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, ): with tempfile.TemporaryDirectory() as tmp_dir: + buffer = ( + executorch_prog.buffer + if isinstance(executorch_prog, 
ExecutorchProgram) + else executorch_prog.buffer() + ) ( input_list, ref_outputs, pte_fname, ) = self._save_model_and_expected_output( module, - executorch_prog.buffer, + buffer, sample_inputs, tmp_dir, ) @@ -134,8 +206,8 @@ def verify_output( etdump_path = f"{tmp_dir}/etdump.etdp" def post_process(): - for i, _f in enumerate(os.listdir(device_output_dir)): - filename = os.path.join(device_output_dir, f"output_0_{i}.raw") + for i, f in enumerate(sorted(os.listdir(device_output_dir))): + filename = os.path.join(device_output_dir, f) output = np.fromfile(filename, dtype=ref_outputs[i].numpy().dtype) output = torch.from_numpy(output).reshape(ref_outputs[i].shape) device_outputs.append(output) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index dde852135b..b17c181ffd 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,12 +4,20 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from collections import OrderedDict from typing import Callable, Dict, List, Tuple +import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor + import executorch.exir as exir import torch +from executorch.backends.qualcomm.builders.node_visitor import ( + QNN_QUANT_TYPE_MAP, + QNN_TENSOR_TYPE_MAP, +) +from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader from executorch.backends.qualcomm.passes.annotate_and_quant_scalar import ( AnnotateAndQuantScalar, ) @@ -49,9 +57,12 @@ ) from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.lowered_backend_module import LoweredBackendModule from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.library import Library + QNN_COMPILE_SPEC = "qnn_compile_spec" @@ -109,26 +120,58 @@ def replace_linear(module: torch.nn.Module): return replace_linear(module) -def canonicalize_program(prog: ExportedProgram): +def canonicalize_program( + exported_program: ExportedProgram | List[LoweredBackendModule], +): # check if user specifies to use multi_contexts # this is a generic approach in case there exists multiple backends - max_sf_buf_size, modules = 0, {} - for _, m in prog.graph_module._modules.items(): - # currently only 1 compile spec is expected in each partition - options = convert_to_option(m.compile_specs[0].value) - if ( - options.backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend - and options.backend_options.htp_options.use_multi_contexts - ): - max_sf_buf_size = max(max_sf_buf_size, len(m.processed_bytes)) - modules[m] = options - - if max_sf_buf_size != 0: - for module, options in modules.items(): + def get_program_info(program): + def process_exported_program(prog): + max_sf_buf_size, module_map = 0, {} + for _, m in prog.graph_module._modules.items(): + # currently only 1 compile spec is expected in each partition + options = convert_to_option(m.compile_specs[0].value) + if ( + options.backend_options.backend_type + == QnnExecuTorchBackendType.kHtpBackend + and options.backend_options.htp_options.use_multi_contexts + ): + max_sf_buf_size = max(max_sf_buf_size, len(m.processed_bytes)) + module_map[m] = options + return max_sf_buf_size, module_map + + def process_lowered_module(module): + return len(module.processed_bytes), { + module: 
convert_to_option(module.compile_specs[0].value) + } + + dispatch = { + ExportedProgram: process_exported_program, + LoweredBackendModule: process_lowered_module, + } + return dispatch[type(program)](program) + + def update_program(max_sf_buf_size, module_map): + def set_spec(module, options): + spec = CompileSpec(QNN_COMPILE_SPEC, convert_to_flatbuffer(options)) + if isinstance(module, ExportedProgram): + module.compile_specs[0] = spec + else: + module._compile_specs[0] = spec + + for module, options in module_map.items(): options.backend_options.htp_options.max_sf_buf_size = max_sf_buf_size - module.compile_specs[0] = CompileSpec( - QNN_COMPILE_SPEC, convert_to_flatbuffer(options) - ) + set_spec(module, options) + + if isinstance(exported_program, list): + max_sf_size, modules_map = 0, {} + for prog in exported_program: + max_sf_buf_size, module_map = get_program_info(prog) + max_sf_size = max(max_sf_size, max_sf_buf_size) + modules_map.update(module_map) + update_program(max_sf_size, modules_map) + else: + update_program(*get_program_info(exported_program)) def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: @@ -184,6 +227,84 @@ def capture_program( return edge_ep +def from_context_binary(ctx_path: str, op_name: str): + def implement_op(custom_op, op_name, outputs): + @torch.library.impl( + custom_op, str(op_name), dispatch_key="CompositeExplicitAutograd" + ) + def op_impl(inputs: List[torch.Tensor]): + return tuple( + torch.zeros(tuple(v.shape), device="meta", dtype=v.dtype) + for v in outputs.values() + ) + + def build_graph(inputs, outputs): + # custom op declaration + inputs_str = "Tensor[] inputs" + func_proto = f"{op_name}({inputs_str}) -> Any" + custom_op = Library(OpContextLoader.namespace, "FRAGMENT") + custom_op.define(func_proto) + # custom op implementation + implement_op(custom_op, op_name, outputs) + + # model architecture mimicking context binary + class Model(torch.nn.Module): + def forward(self, *inputs): + return getattr( + getattr(torch.ops, OpContextLoader.namespace), op_name + ).default(inputs) + + model = Model() + prog = torch.export.export(model, tuple(inputs.values())) + # bookkeeping for variables' life cycle + return { + "custom_op": custom_op, + "custom_module": model, + "edge_program": prog, + } + + def build_tensor(tensors, dtype_map): + ret = OrderedDict() + for t in tensors: + dtype = t.GetDataType() + dtype_torch = dtype_map.get(dtype, None) + assert dtype_torch is not None, f"unknown qnn data type {dtype}" + ret[t.GetName()] = torch.zeros(tuple(t.GetDims()), dtype=dtype_torch) + + return ret + + with open(ctx_path, "rb") as f: + ctx_bin = f.read() + # dummy compiler spec would be fine, since we're not compiling + backend_options = generate_htp_compiler_spec(use_fp16=False) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=QcomChipset.SM8650, + backend_options=backend_options, + is_from_context_binary=True, + ) + # get context-binary io tensor info through qnn manager + qnn_mgr = PyQnnManagerAdaptor.QnnManager( + generate_qnn_executorch_option(compiler_specs), ctx_bin + ) + assert qnn_mgr.Init().value == 0, "failed to load context binary" + qnn_mgr.AllocateTensor() + dtype_map = {} + for type_map in (QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP): + for k, v in type_map.items(): + dtype_map.setdefault(v, k) + inputs = build_tensor(qnn_mgr.GetGraphInputs(), dtype_map) + outputs = build_tensor(qnn_mgr.GetGraphOutputs(), dtype_map) + qnn_mgr.Destroy() + # generate graph specific for loading context + bundle_prog = 
build_graph(inputs, outputs) + bundle_prog.update({"inputs": inputs, "outputs": outputs}) + for n in bundle_prog["edge_program"].graph_module.graph.nodes: + if op_name in n.name: + n.meta[OpContextLoader.meta_ctx_bin] = ctx_bin + break + return bundle_prog + + def draw_graph(title, path, graph_module: torch.fx.GraphModule): graph = passes.graph_drawer.FxGraphDrawer(graph_module, title) with open(f"{path}/{title}.svg", "wb") as f: @@ -249,6 +370,7 @@ def generate_qnn_executorch_compiler_spec( tensor_dump_output_path: str = "", profile: bool = False, shared_buffer: bool = False, + is_from_context_binary: bool = False, ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -311,9 +433,6 @@ def generate_qnn_executorch_compiler_spec( else: qnn_executorch_options.profile_level = QnnExecuTorchProfileLevel.kProfileOff - if shared_buffer: - qnn_executorch_options.shared_buffer = True - if ( online_prepare and backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend @@ -324,7 +443,9 @@ def generate_qnn_executorch_compiler_spec( "please set 'online_prepare' to False" ) + qnn_executorch_options.shared_buffer = shared_buffer qnn_executorch_options.online_prepare = online_prepare + qnn_executorch_options.is_from_context_binary = is_from_context_binary return [ CompileSpec(QNN_COMPILE_SPEC, convert_to_flatbuffer(qnn_executorch_options)) diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt index 7234632a12..4849b55afe 100644 --- a/examples/qualcomm/CMakeLists.txt +++ b/examples/qualcomm/CMakeLists.txt @@ -44,7 +44,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. # set(EXECUTORCH_SRCS_FILE - "${CMAKE_CURRENT_BINARY_DIR}/../../executorch_srcs.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/../../executorch_srcs.cmake" ) extract_sources(${EXECUTORCH_SRCS_FILE}) include(${EXECUTORCH_SRCS_FILE}) @@ -54,6 +54,7 @@ get_filename_component( ) set(_qnn_executor_runner__srcs ${_executor_runner__srcs}) set(_qnn_llama_runner__srcs ${_llama_runner__srcs}) +set(_qnn_qaihub_llama_runner__srcs ${_llama_runner__srcs}) # portable_ops_lib gen_selected_ops(LIB_NAME "full_portable_ops_lib" INCLUDE_ALL_OPS "ON") @@ -71,73 +72,69 @@ target_include_directories( full_portable_ops_lib PUBLIC ${_common_include_directories} ) -# prerpocess executor runner src files -list( - TRANSFORM - _qnn_executor_runner__srcs - PREPEND - "${EXECUTORCH_SOURCE_DIR}/" -) -list( - FILTER - _qnn_executor_runner__srcs - EXCLUDE REGEX - ".*executor_runner.cpp$" -) -list( - PREPEND - _qnn_executor_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_executor_runner.cpp +# preprocess executor runner src files +list(TRANSFORM _qnn_executor_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qnn_executor_runner__srcs EXCLUDE REGEX ".*executor_runner.cpp$") +list(PREPEND _qnn_executor_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_executor_runner.cpp ) - # preprocess llama runner src files -list( - TRANSFORM - _qnn_llama_runner__srcs - PREPEND - "${EXECUTORCH_SOURCE_DIR}/" -) -list( - FILTER - _qnn_llama_runner__srcs - EXCLUDE REGEX - ".*runner.cpp$" -) -list( - PREPEND - _qnn_llama_runner__srcs - ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_llama_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.h +list(TRANSFORM _qnn_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER 
_qnn_llama_runner__srcs EXCLUDE REGEX ".*runner.cpp$") +list(PREPEND _qnn_llama_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_llama_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/runner/runner.h +) +# preprocess qaihub llama runner src files +list(TRANSFORM _qnn_qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/") +list(FILTER _qnn_qaihub_llama_runner__srcs EXCLUDE REGEX ".*runner.cpp*$") +list(PREPEND _qnn_qaihub_llama_runner__srcs + ${CMAKE_CURRENT_LIST_DIR}/executor_runner/qnn_qaihub_llama_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/runner.h + ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/io_memory.cpp + ${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_runner/io_memory.h ) # build executor runner add_executable(qnn_executor_runner ${_qnn_executor_runner__srcs}) -target_include_directories(qnn_executor_runner - PUBLIC - ${_common_include_directories} +target_include_directories( + qnn_executor_runner PUBLIC ${_common_include_directories} ) target_link_libraries( qnn_executor_runner qnn_executorch_backend full_portable_ops_lib etdump ${FLATCCRT_LIB} gflags ) -target_link_options( - qnn_executor_runner PUBLIC -fsanitize=undefined) +target_link_options(qnn_executor_runner PUBLIC -fsanitize=undefined) # build llama runner add_executable(qnn_llama_runner ${_qnn_llama_runner__srcs}) -target_include_directories(qnn_llama_runner - PUBLIC - ${_common_include_directories} +target_include_directories( + qnn_llama_runner PUBLIC ${_common_include_directories} ) target_link_libraries(qnn_llama_runner - qnn_executorch_backend - full_portable_ops_lib - extension_data_loader - extension_module - gflags + qnn_executorch_backend + full_portable_ops_lib + extension_data_loader + extension_module + gflags ) target_compile_options(qnn_llama_runner - PUBLIC - ${_common_compile_options} + PUBLIC ${_common_compile_options} +) +# build qaihub llama runner +add_executable(qnn_qaihub_llama_runner ${_qnn_qaihub_llama_runner__srcs}) +target_include_directories(qnn_qaihub_llama_runner + PUBLIC ${_common_include_directories} +) +target_link_libraries(qnn_qaihub_llama_runner + qnn_executorch_backend + executorch_no_prim_ops + extension_data_loader + extension_module + gflags +) +target_compile_options(qnn_qaihub_llama_runner + PUBLIC ${_common_compile_options} ) diff --git a/examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp b/examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp new file mode 100644 index 0000000000..60e0b66a9b --- /dev/null +++ b/examples/qualcomm/executor_runner/qnn_qaihub_llama_runner.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * @file + * + * This tool can run ExecuTorch model files with Qualcomm AI Engine Direct + * and the portable kernels. + * + * User could specify arguments like desired input data, iterations, etc. + * Currently we assume that the outputs are all fp32 tensors. 
+ */ + +#include +#include +#include +#include + +#include + +#include + +DEFINE_string(sharded_1_path, "", "Path to 1st sharded pte file"); +DEFINE_string(sharded_2_path, "", "Path to 2nd sharded pte file"); +DEFINE_string(sharded_3_path, "", "Path to 3rd sharded pte file"); +DEFINE_string(sharded_4_path, "", "Path to 4th sharded pte file"); + +DEFINE_string(freq_cos_path, "", "Path to precomputed position embeddings"); +DEFINE_string(freq_sin_path, "", "Path to precomputed position embeddings"); + +DEFINE_string(output_path, "outputs", "Executorch inference data output path."); +DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff."); +DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt."); +DEFINE_double( + temperature, + 0.8f, + "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic"); +DEFINE_int32( + eval_mode, + 0, + "0: PromptProcessor / 1: TokenGenerator / 2: MixedMode (TBD)"); +DEFINE_int32( + seq_len, + 128, + "Total number of tokens to generate (prompt + output). Defaults to max_seq_len. If the number of input tokens + seq_len > max_seq_len, the output will be truncated to max_seq_len tokens."); +DEFINE_double(logits_scale, 0.0, "Path to logits scale file"); +DEFINE_int32(logits_offset, 0, "Path to logits offset file"); + +int main(int argc, char** argv) { + using namespace torch::executor; + + gflags::ParseCommandLineFlags(&argc, &argv, true); + + std::vector models_path = { + FLAGS_sharded_1_path, + FLAGS_sharded_2_path, + FLAGS_sharded_3_path, + FLAGS_sharded_4_path}; + std::vector pos_embs_path = { + FLAGS_freq_cos_path, FLAGS_freq_sin_path}; + + // create llama runner + Runner runner( + models_path, + pos_embs_path, + FLAGS_tokenizer_path.c_str(), + FLAGS_eval_mode, + FLAGS_temperature, + FLAGS_logits_scale, + FLAGS_logits_offset); + + // generate tokens & store inference output + std::ofstream fout(FLAGS_output_path.c_str()); + runner.generate(FLAGS_prompt, FLAGS_seq_len, [&](const std::string& piece) { + fout << piece; + }); + fout.close(); + return 0; +} diff --git a/examples/qualcomm/llama2/README.md b/examples/qualcomm/llama2/README.md index cc03d2208a..4670f74251 100644 --- a/examples/qualcomm/llama2/README.md +++ b/examples/qualcomm/llama2/README.md @@ -1,20 +1,21 @@ # Summary ## Overview -This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. The following setting is the support +This file provides you the instructions to run LLAMA2 with different parameters via Qualcomm HTP backend. Following settings support for 1. Stories 110M +2. Llama-2-7b-chat-hf Please check corresponding section for more information. ## Stories 110M This example demonstrates how to run a smaller LLAMA2, stories110M on mobile via Qualcomm HTP backend. Model architecture is fine-tuned specifically for HTP to accelerate the performance. Weight is quantized via PTQ quantization to fit the model on a phone. -## Instructions -### Step 1: Set up +### Instructions +#### Step 1: Setup 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. -2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. 
-### Step2: Prepare Model +#### Step2: Prepare Model Download and preapre stories110M model ```bash @@ -29,12 +30,38 @@ python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json ``` -### Step3: Run default examples +#### Step3: Run default examples Default example generates the story based on the given prompt, "Once". ```bash # 16a4w quant: -python examples/qualcomm/llama2/llama.py -a ${ARTIFACTS} -b build_android -s ${SERIAL_NUM} -H ${HOST_NAME} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" +python examples/qualcomm/llama2/llama.py -a ${ARTIFACTS} -b build_android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once" ``` -### (Note) Customized PTQ data set +#### (Note) Customized PTQ data set User prompts are used for PTQ calibration data. Take the examples above, the word "Once" is the only word for PTQ. If you want to observe more data during the calibration time. Please add more prompts to the args `--prompt`. + + +## Llama-2-7b-chat-hf +This example demonstrates how to run Llama-2-7b-chat-hf on mobile via Qualcomm HTP backend. Model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/). +Note that the pre-compiled context binaries could not be futher fine-tuned for other downstream tasks. + +### Instructions +#### Step 1: Setup +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. +2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend. + +#### Step2: Prepare Model +1. Create account for https://aihub.qualcomm.com/ +2. Follow instructions in https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (will take some time to finish) + +```bash +# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model +# tokenizer.bin: +python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +``` + +#### Step3: Run default examples +```bash +# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized +python examples/qualcomm/llama2/llama_qaihub.py -a ${ARTIFACTS} -b build_android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?" +``` diff --git a/examples/qualcomm/llama2/llama_qaihub.py b/examples/qualcomm/llama2/llama_qaihub.py new file mode 100644 index 0000000000..b5dd6ab458 --- /dev/null +++ b/examples/qualcomm/llama2/llama_qaihub.py @@ -0,0 +1,235 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import gc +import os + +import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor + +import torch +from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( # noqa: F401 + QcomChipset, +) +from executorch.backends.qualcomm.utils.utils import ( + canonicalize_program, + from_context_binary, + generate_htp_compiler_spec, + generate_qnn_executorch_compiler_spec, + generate_qnn_executorch_option, +) +from executorch.examples.qualcomm.scripts.utils import ( + setup_common_args_and_variables, + SimpleADB, +) +from executorch.exir.backend.backend_api import to_backend +from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass + + +def main(): + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./llama2_qai_hub", + default="./llama2_qai_hub", + type=str, + ) + + parser.add_argument( + "--context_binaries", + help="path to context binaries generated from qai_hub", + required=True, + ) + + parser.add_argument( + "--use_prompt_processor", + help="tokens will be evaluated all at once", + default=False, + action="store_true", + ) + + parser.add_argument( + "--tokenizer_bin", + help="llama2 tokenizer binary", + required=True, + type=str, + ) + + parser.add_argument( + "--seq_len", + help="ouput sequence length for llama2", + default=128, + type=int, + ) + + parser.add_argument( + "--temperature", + help="sampling temperature for llama2", + default=0.8, + type=float, + ) + + parser.add_argument( + "--prompt", + help="user prompts for llama2", + required=True, + type=str, + ) + + parser.add_argument( + "--pre_gen_pte", + help="folder path to pre-compiled ptes", + default=None, + type=str, + ) + + args = parser.parse_args() + target_names = ( + [ + f"llama_v2_7b_chat_quantized_Llama2_PromptProcessor_{i}_Quantized.bin" + for i in range(1, 5) + ] + if args.use_prompt_processor + else [ + f"llama_v2_7b_chat_quantized_Llama2_TokenGenerator_{i}_Quantized.bin" + for i in range(1, 5) + ] + ) + + # common part for compile & inference + backend_options = generate_htp_compiler_spec( + use_fp16=False, + use_multi_contexts=True, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=getattr(QcomChipset, args.model), + backend_options=backend_options, + is_from_context_binary=True, + ) + + if args.pre_gen_pte is None: + # create custom operators as context loader + bundle_programs = [ + from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}") + for i, target in enumerate(target_names) + ] + # lower with QnnBackend + lowered_modules = [ + to_backend("QnnBackend", prog["edge_program"], compiler_specs) + for prog in bundle_programs + ] + # setup spill-fill buffer for relieving runtime memory usage + canonicalize_program(lowered_modules) + # export pte files + pte_name, pte_files = "qaihub_llama7b", [] + for i in range(len(target_names)): + memory_planning_pass = MemoryPlanningPass( + memory_planning_algo="greedy", + alloc_graph_input=False, + alloc_graph_output=False, + ) + pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte") + with open(pte_files[-1], "wb") as file: + file.write( + lowered_modules[0].buffer(memory_planning=memory_planning_pass) + ) + # gc for reducing host memory consuming + bundle_programs.pop(0) + lowered_modules.pop(0) + gc.collect() + else: + pte_name = "qaihub_llama7b" + pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)] + + if args.compile_only: + return + 
+    def get_logit_encoding(path_to_last_shard: str):
+        with open(f"{args.context_binaries}/{path_to_last_shard}", "rb") as f:
+            ctx_bin = f.read()
+        qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+            generate_qnn_executorch_option(compiler_specs), ctx_bin
+        )
+        assert qnn_mgr.Init().value == 0, "failed to load context binary"
+        qnn_mgr.AllocateTensor()
+        logits = qnn_mgr.GetGraphOutputs()[-1]
+        encoding = logits.GetEncodings()
+        qnn_mgr.Destroy()
+        return encoding.data["scale"].item(), encoding.data["offset"].item()
+
+    # setup required paths accordingly
+    # qnn_sdk       : QNN SDK path setup in environment variable
+    # artifact_path : path where artifacts were built
+    # pte_path      : path where executorch binary was stored
+    # device_id     : serial number of android device
+    # workspace     : folder for storing artifacts on android device
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=args.build_folder,
+        pte_path=pte_files,
+        workspace=f"/data/local/tmp/executorch/{pte_name}",
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+        runner="examples/qualcomm/qnn_qaihub_llama_runner",
+    )
+    output_file = "result.txt"
+    pos_embs_file = ["freq_cos", "freq_sin"]
+    scale, offset = get_logit_encoding(target_names[-1])
+    runner_args = [
+        *[
+            f"--sharded_{i+1}_path {os.path.basename(pte_file)}"
+            for i, pte_file in enumerate(pte_files)
+        ],
+        *[f"--{fname}_path {fname}.raw" for fname in pos_embs_file],
+        f"--output_path {adb.output_folder}/{output_file}",
+        f"--tokenizer_path {os.path.basename(args.tokenizer_bin)}",
+        f"--prompt '{args.prompt}'",
+        f"--temperature {args.temperature}",
+        f"--seq_len {args.seq_len}",
+        f"--eval_mode {0 if args.use_prompt_processor else 1}",
+        f"--logits_scale {scale}",
+        f"--logits_offset {-offset}",
+    ]
+    runner_cmds = " ".join(
+        [
+            f"cd {adb.workspace} &&",
+            "export ADSP_LIBRARY_PATH=. &&",
+            "export LD_LIBRARY_PATH=. &&",
+            f"./qnn_qaihub_llama_runner {' '.join(runner_args)}",
+        ]
+    )
+
+    def compute_pos_embedding():
+        head_dim, max_seq_len, theta = 128, 1024, 10000.0
+        base = torch.arange(0, head_dim, 2)
+        freqs = 1.0 / (theta ** (base[: (head_dim // 2)].float() / head_dim))
+        t = torch.arange(max_seq_len * 2)
+        freqs = torch.outer(t, freqs).float()
+        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+        freqs_cis = freqs_cis[0:max_seq_len]
+        freqs_real = torch.view_as_real(freqs_cis)
+        return freqs_real[:, :, 0], freqs_real[:, :, 1]
+
+    def post_process():
+        with open(f"{args.artifact}/outputs/{output_file}", "r") as f:
+            print(f.read())
+
+    custom_files = [args.tokenizer_bin]
+    for var_name, freq in zip(pos_embs_file, compute_pos_embedding()):
+        custom_files.append(f"{adb.working_dir}/{var_name}.raw")
+        scale, offset = (freq.max() - freq.min()) / 65535, 32768
+        freq = (freq / scale + offset).clip(min=0, max=65535).detach()
+        freq.to(dtype=torch.uint16).numpy().tofile(custom_files[-1])
+
+    adb.push(files=custom_files)
+    adb.execute(custom_runner_cmd=runner_cmds)
+    adb.pull(args.artifact, callback=post_process)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/qualcomm/llama2/qaihub_runner/io_memory.cpp b/examples/qualcomm/llama2/qaihub_runner/io_memory.cpp
new file mode 100644
index 0000000000..c7e3df9996
--- /dev/null
+++ b/examples/qualcomm/llama2/qaihub_runner/io_memory.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */ + +#include + +#include +#include + +namespace torch { +namespace executor { + +Memory::Memory( + const std::vector& pos_embs_path, + std::vector>& modules) + : data_ptr_(nullptr, [](void*) {}), + input_tensors_(4), + output_tensors_(4), + pos_embs_path_(pos_embs_path), + modules_(modules) {} + +Memory::~Memory() {} + +void* Memory::get_mutable_ptr() { + return data_ptr_.get(); +} + +std::vector Memory::get_input_tensors(int shard_index) { + std::vector ret; + ret.reserve(input_tensors_.size()); + for (TensorImpl* impl : input_tensors_[shard_index]) { + ret.emplace_back(Tensor(impl)); + } + return ret; +} + +std::vector Memory::get_output_tensors(int shard_index) { + std::vector ret; + ret.reserve(output_tensors_.size()); + for (TensorImpl* impl : output_tensors_[shard_index]) { + ret.emplace_back(Tensor(impl)); + } + return ret; +} + +BertMemory::BertMemory( + const std::vector& pos_embs_path, + std::vector>& modules) + : Memory(pos_embs_path, modules) { + data_ptr_ = std::unique_ptr( + new IO, [](void* ptr) { delete static_cast(ptr); }); +} + +void BertMemory::prepare_io( + const std::vector>& methods_meta) { + IO* ptr = static_cast(data_ptr_.get()); + std::memset(ptr, 0, sizeof(IO)); + + for (int i = 0; i < 4; ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + // [I] position embedding initialization + for (size_t i = 0; i < pos_embs_path_.size(); ++i) { + std::ifstream fin(pos_embs_path_[i], std::ios::binary); + fin.read( + reinterpret_cast( + i == 0 ? ptr->position_ids_cos : ptr->position_ids_sin), + 1024 * 64 * 2); + fin.close(); + } + // [I]: shard1,2,3,4 + { + // [I]: input_ids + Result input_ids = methods_meta[0]->input_tensor_meta(0); + input_ids_ = std::make_unique( + input_ids->scalar_type(), + input_ids->sizes().size(), + const_cast(input_ids->sizes().data()), + ptr->input_ids, + const_cast(input_ids->dim_order().data())); + input_tensors_[0].push_back(input_ids_.get()); + // [I]: atten_mask + Result atten_mask = methods_meta[0]->input_tensor_meta(1); + attention_mask_ = std::make_unique( + atten_mask->scalar_type(), + atten_mask->sizes().size(), + const_cast(atten_mask->sizes().data()), + ptr->attention_mask, + const_cast(atten_mask->dim_order().data())); + input_tensors_[0].push_back(attention_mask_.get()); + // [I]: pos_ids_cos + Result pos_ids_cos = methods_meta[0]->input_tensor_meta(2); + position_ids_cos_ = std::make_unique( + pos_ids_cos->scalar_type(), + pos_ids_cos->sizes().size(), + const_cast(pos_ids_cos->sizes().data()), + ptr->position_ids_cos, + const_cast(pos_ids_cos->dim_order().data())); + input_tensors_[0].push_back(position_ids_cos_.get()); + // [I]: pos_ids_sin + Result pos_ids_sin = methods_meta[0]->input_tensor_meta(3); + position_ids_sin_ = std::make_unique( + pos_ids_sin->scalar_type(), + pos_ids_sin->sizes().size(), + const_cast(pos_ids_sin->sizes().data()), + ptr->position_ids_sin, + const_cast(pos_ids_sin->dim_order().data())); + input_tensors_[0].push_back(position_ids_sin_.get()); + // [IO]: hidden_state => [I] shard2,3,4 + int output_index = 8 * 2 * 32; // layres*(k + v caches)*heads + Result hidden_state = + methods_meta[0]->output_tensor_meta(output_index); + hidden_state_ = std::make_unique( + hidden_state->scalar_type(), + hidden_state->sizes().size(), + const_cast(hidden_state->sizes().data()), + ptr->hidden_state, + const_cast( + hidden_state->dim_order().data())); + // reuse inputs for following tensors + for (int shard_index = 1; shard_index < 4; 
++shard_index) { + // inpus of shard1,2,3: hidden_state, atten_mask, pos_ids_cos, pos_ids_sin + input_tensors_[shard_index].push_back(hidden_state_.get()); + input_tensors_[shard_index].push_back(attention_mask_.get()); + input_tensors_[shard_index].push_back(position_ids_cos_.get()); + input_tensors_[shard_index].push_back(position_ids_sin_.get()); + } + } + // [O] kv_cache for shard1,2,3,4 + for (int offset = 0, shard_index = 0; shard_index < 4; + offset += 8, shard_index++) { + for (int layer = 0; layer < 8; ++layer) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int head = 0; head < 32; ++head) { + int index = 64 * layer + cache_group * 32 + head; + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? v_cache_ : k_cache_); + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_group == 0 ? ptr->v_cache[layer + offset][head] + : ptr->k_cache[layer + offset][head], + const_cast( + kv_cache->dim_order().data()))); + output_tensors_[shard_index].push_back(cache.back().get()); + } + } + } + } + // [O]: hidden_state for shard1,2,3 + for (int shard_index = 0; shard_index < 3; ++shard_index) { + output_tensors_[shard_index].push_back(hidden_state_.get()); + } + // [O]: logits + { + int output_index = 8 * 2 * 32; // layers*(k + v caches)*heads + Result logits = + methods_meta[3]->output_tensor_meta(output_index); + logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->logits, + const_cast(logits->dim_order().data())); + output_tensors_[3].push_back(logits_.get()); + } +} + +void BertMemory::update_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + (void)output_tensors; + IO* ptr = static_cast(data_ptr_.get()); + static int num_tokens_generated = 0; + int seq_len = 1024, last_index = seq_len - 1; + // refill past token ids, which is equivalent to following snippet: + // ---> + // for (int i = 0; i < last_index; ++i) { + // ptr->input_ids[i] = ptr->input_ids[i + 1]; + // } + // ptr->input_ids[last_index] = static_cast(cur_token); + // <--- + int32_t* new_addr = ++num_tokens_generated + ptr->input_ids; + new_addr[last_index] = static_cast(cur_token); + input_ids_->set_data(new_addr); + // update causal mask for next token + int tokens = pos + 1, start = last_index - tokens; + for (int i = last_index; tokens >= 0; --i, --tokens) { + ptr->attention_mask[i * seq_len + start] = 65535; + } +} + +KVCachedMemory::KVCachedMemory( + const std::vector& pos_embs_path, + std::vector>& modules) + : Memory(pos_embs_path, modules) { + data_ptr_ = std::unique_ptr( + new IO, [](void* ptr) { delete static_cast(ptr); }); + futures_ = std::vector>(thread_pool_.num_workers()); +} + +void KVCachedMemory::prepare_io( + const std::vector>& methods_meta) { + IO* ptr = static_cast(data_ptr_.get()); + std::memset(ptr, 0, sizeof(IO)); + + for (int i = 0; i < 4; ++i) { + ET_CHECK_MSG( + methods_meta[i].ok(), + "Failed to get method_meta 0x%x", + static_cast(methods_meta[i].error())); + } + // [I] position embedding initialization + for (size_t i = 0; i < pos_embs_path_.size(); ++i) { + std::ifstream fin(pos_embs_path_[i], std::ios::binary); + fin.read( + reinterpret_cast( + i == 0 ? 
ptr->position_ids_cos : ptr->position_ids_sin), + 1024 * 64 * 2); + fin.close(); + } + // [I]: shard1,2,3,4 + { + // [I]: input_ids + Result input_ids = methods_meta[0]->input_tensor_meta(0); + input_ids_ = std::make_unique( + input_ids->scalar_type(), + input_ids->sizes().size(), + const_cast(input_ids->sizes().data()), + &ptr->input_ids, + const_cast(input_ids->dim_order().data())); + input_tensors_[0].push_back(input_ids_.get()); + // [I]: atten_mask + Result atten_mask = methods_meta[0]->input_tensor_meta(1); + attention_mask_ = std::make_unique( + atten_mask->scalar_type(), + atten_mask->sizes().size(), + const_cast(atten_mask->sizes().data()), + ptr->attention_mask, + const_cast(atten_mask->dim_order().data())); + input_tensors_[0].push_back(attention_mask_.get()); + // [I]: pos_ids_cos + Result pos_ids_cos = methods_meta[0]->input_tensor_meta(2); + position_ids_cos_ = std::make_unique( + pos_ids_cos->scalar_type(), + pos_ids_cos->sizes().size(), + const_cast(pos_ids_cos->sizes().data()), + ptr->position_ids_cos, + const_cast(pos_ids_cos->dim_order().data())); + input_tensors_[0].push_back(position_ids_cos_.get()); + // [I]: pos_ids_sin + Result pos_ids_sin = methods_meta[0]->input_tensor_meta(3); + position_ids_sin_ = std::make_unique( + pos_ids_sin->scalar_type(), + pos_ids_sin->sizes().size(), + const_cast(pos_ids_sin->sizes().data()), + ptr->position_ids_sin, + const_cast(pos_ids_sin->dim_order().data())); + input_tensors_[0].push_back(position_ids_sin_.get()); + // [IO]: hidden_state => [I] shard2,3,4 + int output_index = 8 * 2 * 32; // layres*(k + v caches)*heads + Result hidden_state = + methods_meta[0]->output_tensor_meta(output_index); + hidden_state_ = std::make_unique( + hidden_state->scalar_type(), + hidden_state->sizes().size(), + const_cast(hidden_state->sizes().data()), + ptr->hidden_state, + const_cast( + hidden_state->dim_order().data())); + // reuse inputs for following tensors + for (int shard_index = 1; shard_index < 4; ++shard_index) { + // inpus of shard1,2,3: hidden_state, atten_mask, pos_ids_cos, pos_ids_sin + input_tensors_[shard_index].push_back(hidden_state_.get()); + input_tensors_[shard_index].push_back(attention_mask_.get()); + input_tensors_[shard_index].push_back(position_ids_cos_.get()); + input_tensors_[shard_index].push_back(position_ids_sin_.get()); + } + } + // [I] kv_cache for shard1,2,3,4 + for (int offset = 0, shard_index = 0, v_stride = 1023 * 128; shard_index < 4; + offset += 8, shard_index++) { + for (int layer = 0; layer < 8; ++layer) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int head = 0; head < 32; ++head) { + // bypass hidden_state(input_ids), atten_mask, pos_cos, pos_sin + int index = 64 * layer + cache_group * 32 + head + 4; + Result kv_cache = + methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_ : v_cache_in_); + + void* cache_ptr = (cache_group == 0) + ? 
static_cast(ptr->k_cache[layer + offset][head]) + : static_cast( + ptr->v_cache[layer + offset] + head * v_stride); + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_[shard_index].push_back(cache.back().get()); + } + } + } + } + // [O] kv_cache for shard1,2,3,4 + for (int offset = 0, shard_index = 0, v_stride = 1023 * 128; shard_index < 4; + offset += 8, shard_index++) { + for (int layer = 0; layer < 8; ++layer) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int head = 0; head < 32; ++head) { + int index = 64 * layer + cache_group * 32 + head; + Result kv_cache = + methods_meta[shard_index]->output_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? v_cache_out_ : k_cache_out_); + + void* cache_ptr = (cache_group == 0) + ? static_cast( + ptr->v_cache[layer + offset] + (head + 1) * v_stride) + : static_cast(ptr->k_cache_out[layer + offset][head]); + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + output_tensors_[shard_index].push_back(cache.back().get()); + } + } + } + } + // [O]: hidden_state for shard1,2,3 + for (int shard_index = 0; shard_index < 3; ++shard_index) { + output_tensors_[shard_index].push_back(hidden_state_.get()); + } + // [O]: logits + { + int output_index = 8 * 2 * 32; // layres*(k + v caches)*heads + Result logits = + methods_meta[3]->output_tensor_meta(output_index); + logits_ = std::make_unique( + logits->scalar_type(), + logits->sizes().size(), + const_cast(logits->sizes().data()), + ptr->logits, + const_cast(logits->dim_order().data())); + output_tensors_[3].push_back(logits_.get()); + } + // thread pool jobs + for (int i = 0, range = 1024 / thread_pool_.num_workers(); + i < thread_pool_.num_workers(); + ++i) { + lr_update_kv_.push_back( + {.start = i * range, .end = (i + 1) * range, .step = 1}); + } +} + +void KVCachedMemory::update_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) { + IO* ptr = static_cast(data_ptr_.get()); + int seq_len = 1023; + // update input_ids + ptr->input_ids = static_cast(cur_token); + // update causal mask for next token + ptr->attention_mask[seq_len - pos] = 65535; + // update position_ids + position_ids_cos_->set_data(position_ids_cos_->mutable_data() + 64); + position_ids_sin_->set_data(position_ids_sin_->mutable_data() + 64); + + auto update_kv = [&](void* arg) { + LoopRange* lr = static_cast(arg); + // update v_cache + for (int i = lr->start; i < lr->end; i += lr->step) { + v_cache_in_[i]->set_data(v_cache_in_[i]->mutable_data() + 128); + v_cache_out_[i]->set_data(v_cache_out_[i]->mutable_data() + 128); + } + // update output tensors of v_cache, 256 is the number of kvs per shard + int shard = lr->start >> 8, offset = shard << 8; + int start = lr->start - offset, end = lr->end - offset; + for (int cache_stride = start; cache_stride < end; cache_stride += 32) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int head = 0; head < 32; ++head) { + // k, v are placed interleaved + int index = (cache_stride << 1) + (cache_group << 5) + head; + ET_CHECK_MSG( + modules_[shard]->set_output_data_ptr( + output_tensors[shard][index], index) == Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", 
+ shard, + index); + } + } + } + }; + for (int i = 0; i < lr_update_kv_.size(); ++i) { + futures_[i] = std::move(thread_pool_.issue(update_kv, &lr_update_kv_[i])); + } + // update k_cache by single thread, this part is cpu cache sensitive + for (int i = 0; i < k_cache_in_.size(); ++i) { + uint8_t* ptr_in = k_cache_in_[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out_[i]->data(); + for (size_t j = 0, offset = seq_len; j < 128; ++j, offset += seq_len) { + ptr_in[offset] = ptr_out[j]; + } + k_cache_in_[i]->set_data(ptr_in + 1); + } + for (auto& future : futures_) { + future.wait(); + } +} + +ThreadPool::ThreadPool() : stop_(false) { + size_t hc = (std::thread::hardware_concurrency() + 3) / 4; + // maximum number should be divisible by head dimension which equals to 32 + num_workers_ = min(32, hc * 4); + for (size_t i = 0; i < num_workers_; ++i) { + threads_.emplace_back([this]() { + while (1) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return !jobs_.empty() || stop_; }); + + if (stop_ && jobs_.empty()) + return; + + JobInfo job_info(std::move(jobs_.front())); + jobs_.pop(); + lock.unlock(); + job_info.func(job_info.arg); + } + }); + } +} + +ThreadPool::~ThreadPool() { + std::unique_lock lock(mutex_); + stop_ = true; + lock.unlock(); + cv_.notify_all(); + for (auto& thread : threads_) { + thread.join(); + } +} + +std::future ThreadPool::issue( + std::function func, + void* arg) { + std::unique_lock lock(mutex_); + jobs_.push(JobInfo(std::packaged_task(func), arg)); + std::future f = std::move(jobs_.back().func.get_future()); + lock.unlock(); + cv_.notify_one(); + return f; +} + +size_t ThreadPool::num_workers() { + return num_workers_; +} + +} // namespace executor +} // namespace torch diff --git a/examples/qualcomm/llama2/qaihub_runner/io_memory.h b/examples/qualcomm/llama2/qaihub_runner/io_memory.h new file mode 100644 index 0000000000..2025ec6fa4 --- /dev/null +++ b/examples/qualcomm/llama2/qaihub_runner/io_memory.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace executor { + +class Memory { + public: + Memory( + const std::vector& pos_embs_path, + std::vector>& modules); + virtual ~Memory(); + virtual void prepare_io( + const std::vector>& methods_meta) = 0; + virtual void update_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) = 0; + void* get_mutable_ptr(); + std::vector get_input_tensors(int shard_index); + std::vector get_output_tensors(int shard_index); + + protected: + std::unique_ptr data_ptr_; + std::vector> input_tensors_; + std::vector> output_tensors_; + std::vector pos_embs_path_; + std::vector> modules_; +}; + +class BertMemory : public Memory { + public: + BertMemory( + const std::vector& pos_embs_path, + std::vector>& modules); + void prepare_io(const std::vector>& methods_meta) override; + void update_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) override; + struct IO { + int32_t input_ids[1024 * 2]; + uint16_t hidden_state[1024 * 4096]; + uint16_t attention_mask[1024 * 1024]; + uint16_t position_ids_cos[1024 * 64]; + uint16_t position_ids_sin[1024 * 64]; + uint8_t k_cache[32][32][128 * 1024]; + uint8_t v_cache[32][32][1024 * 128]; + uint16_t logits[32000]; + }; + + private: + std::unique_ptr input_ids_; + std::unique_ptr hidden_state_; + std::unique_ptr attention_mask_; + std::unique_ptr position_ids_cos_; + std::unique_ptr position_ids_sin_; + std::vector> k_cache_; + std::vector> v_cache_; + std::unique_ptr logits_; +}; + +class ThreadPool { + public: + ThreadPool(); + ~ThreadPool(); + + std::future issue(std::function func, void* arg); + size_t num_workers(); + + private: + struct JobInfo { + explicit JobInfo(std::packaged_task&& func, void* arg) + : func(std::move(func)), arg(arg) {} + explicit JobInfo(JobInfo&& job_info) + : func(std::move(job_info.func)), arg(job_info.arg) {} + std::packaged_task func; + void* arg; + }; + size_t num_workers_; + std::vector threads_; + std::queue jobs_; + std::mutex mutex_; + std::condition_variable cv_; + bool stop_; +}; + +class KVCachedMemory : public Memory { + public: + KVCachedMemory( + const std::vector& pos_embs_path, + std::vector>& modules); + void prepare_io(const std::vector>& methods_meta) override; + void update_io( + int64_t cur_token, + int64_t pos, + std::vector>& output_tensors) override; + struct IO { + int32_t input_ids; + uint16_t hidden_state[4096]; + uint16_t attention_mask[1024]; + uint16_t position_ids_cos[1024 * 64]; + uint16_t position_ids_sin[1024 * 64]; + uint8_t k_cache[32][32][129 * 1023]; + uint8_t v_cache[32][33 * 1023 * 128]; + uint8_t k_cache_out[32][32][128]; + uint16_t logits[32000]; + }; + struct LoopRange { + int32_t start; + int32_t end; + int32_t step; + }; + + private: + std::unique_ptr input_ids_; + std::unique_ptr hidden_state_; + std::unique_ptr attention_mask_; + std::unique_ptr position_ids_cos_; + std::unique_ptr position_ids_sin_; + std::vector> k_cache_in_; + std::vector> v_cache_in_; + std::vector> k_cache_out_; + std::vector> v_cache_out_; + std::unique_ptr logits_; + std::vector lr_update_kv_; + std::vector> futures_; + ThreadPool thread_pool_; +}; + +} // namespace executor +} // namespace torch diff --git a/examples/qualcomm/llama2/qaihub_runner/runner.cpp b/examples/qualcomm/llama2/qaihub_runner/runner.cpp new file mode 100644 index 0000000000..a905ecd3e7 --- /dev/null +++ b/examples/qualcomm/llama2/qaihub_runner/runner.cpp @@ -0,0 +1,359 @@ +/* + * 
Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "arm_neon.h" + +namespace torch { +namespace executor { + +namespace { +static constexpr auto kTopp = 0.9f; +void printReport(const Runner::Stats& stats); +std::string statsToJsonString(const Runner::Stats& stats); +} // namespace + +Runner::Runner( + const std::vector& models_path, + const std::vector& pos_embs_path, + const std::string& tokenizer_path, + const int eval_mode, + const float temperature, + const float logits_scale, + const int logits_offset) + : tokenizer_path_(tokenizer_path), + temperature_(temperature), + bos_id_(1), + eos_id_(2), + n_bos_(1), + n_eos_(1), + vocab_size_(32000), + max_seq_len_(1024), + eval_mode_(eval_mode), + stats_({}), + logits_scale_(logits_scale), + logits_offset_(logits_offset) { + for (size_t i = 0; i < models_path.size(); ++i) { + modules_.push_back(std::make_shared( + models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); + ET_LOG(Info, "creating module: model_path=%s", models_path[i].c_str()); + } + ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); + switch (eval_mode_) { + case EvalMode::kBert: + io_mem_ = std::make_unique(pos_embs_path, modules_); + break; + case EvalMode::kKVCached: + io_mem_ = std::make_unique(pos_embs_path, modules_); + break; + default: + ET_CHECK_MSG(false, "unsupported evaluation mode"); + } + ET_LOG(Info, "creating io_memory"); +} + +bool Runner::is_loaded() const { + bool loaded = true; + for (const std::shared_ptr& module : modules_) { + loaded &= module->is_loaded(); + } + return loaded && tokenizer_ && sampler_; +} + +Error Runner::load() { + if (is_loaded()) { + return Error::Ok; + } + for (std::shared_ptr& module : modules_) { + ET_CHECK_OK_OR_RETURN_ERROR(module->load_method("forward")); + } + + // load tokenizer + tokenizer_ = std::make_unique(); + tokenizer_->load(tokenizer_path_); + + // create sampler + sampler_ = std::make_unique( + vocab_size_, + temperature_, + kTopp, + static_cast(std::time(nullptr))); + + // prepare io + auto methods_meta = get_methods_meta(); + io_mem_->prepare_io(methods_meta); + return Error::Ok; +} + +int32_t Runner::logitsToToken(const Tensor& logits_tensor) { + static std::vector logits_f(vocab_size_); + static int32x4_t offset = vmovq_n_s32(logits_offset_); + static float32x4_t scale = vmovq_n_f32(logits_scale_); + const uint16_t* logits = logits_tensor.data_ptr(); + // dequantize + for (int i = 0; i < vocab_size_; i += 4) { + const uint16_t* in = logits + i; + float* out = logits_f.data() + i; + int32_t data[4] = {in[0], in[1], in[2], in[3]}; + int32x4_t quantized = vld1q_s32(data); + int32x4_t shifted = vsubq_s32(quantized, offset); + float32x4_t shifted_f = vcvtq_f32_s32(shifted); + vst1q_f32(out, vmulq_f32(shifted_f, scale)); + } + return sampler_->sample(logits_f.data()); +} + +void Runner::run_model_step(std::vector>& inputs) { + for (size_t i = 0, num_modules = modules_.size(); i < num_modules; ++i) { + Result> outputs_res = modules_[i]->forward(inputs[i]); + ET_CHECK_MSG( + outputs_res.error() == Error::Ok, "shard %zu inference failed", i); + } +} + 
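// For reference, the NEON dequantization in logitsToToken() above is
// equivalent to this scalar sketch (same affine mapping, intrinsics aside):
//
//   for (int i = 0; i < vocab_size_; ++i) {
//     logits_f[i] =
//         (static_cast<int32_t>(logits[i]) - logits_offset_) * logits_scale_;
//   }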
+// TODO: add overloaded method for on-device tokenize +Error Runner::generate( + const std::string& prompt, + int32_t seq_len, + std::function token_callback, + std::function stats_callback) { + ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + + std::vector> input_tensors, output_tensors; + std::vector> inputs; + if (!is_loaded()) { + stats_.model_load_start_ms = util::time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + for (int i = 0; i < 4; ++i) { + input_tensors.emplace_back(io_mem_->get_input_tensors(i)); + output_tensors.emplace_back(io_mem_->get_output_tensors(i)); + for (size_t j = 0; j < output_tensors[i].size(); ++j) { + ET_CHECK_MSG( + modules_[i]->set_output_data_ptr(output_tensors[i][j], j) == + Error::Ok, + "failed to set output tensor for module %d's %zu'th output", + i, + j); + } + inputs.emplace_back( + std::vector(begin(input_tensors[i]), end(input_tensors[i]))); + } + stats_.model_load_end_ms = util::time_in_ms(); + } + + stats_.inference_start_ms = util::time_in_ms(); + shouldStop_ = false; + seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_; + + Result> encode_res = + tokenizer_->encode(prompt, n_bos_, 0); + ET_CHECK_OK_OR_RETURN_ERROR( + encode_res.error(), "failed to encode prompt %s", prompt.c_str()); + + std::vector prompt_tokens = encode_res.get(); + int num_prompt_tokens = prompt_tokens.size(); + ET_CHECK_MSG(num_prompt_tokens < max_seq_len_, "max seq length exceeded"); + ET_CHECK_MSG( + num_prompt_tokens < seq_len, + "sequence length exceeded - please increase the seq_len value"); + + int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; + if (eval_mode_ == EvalMode::kBert) { + BertMemory::IO* ptr = + static_cast(io_mem_->get_mutable_ptr()); + int start_index = max_seq_len_ - num_prompt_tokens; + // indices are filled from behind, take 3 tokens as an example: + // > tokens : [...tok_pad, tok_bos, tok1, tok2] + // > indices: [0.....1020, 1021, 1022, 1023] + for (int i = 0; i < num_prompt_tokens; i++) { + ptr->input_ids[start_index + i] = static_cast(prompt_tokens[i]); + } + // causal attention mask is filled as following: + // 0, 65535 maps to -100.0, 0.0 after dequantizing + // 0 : [0,...................0, 0, 0, 0] + // 1-1019 : ... 
+ // 1020 : [0,...............65535, 0, 0, 0] + // 1021 : [0,...............65535, 65535, 0, 0] + // 1022 : [0,...............65535, 65535, 65535, 0] + // 1023 : [0,...............65535, 65535, 65535, 65535] + for (int i = max_seq_len_ - 1, len = num_prompt_tokens; len >= 0; + --i, --len) { + for (int j = 0; j <= len; ++j) { + ptr->attention_mask[i * max_seq_len_ + start_index - 1 + j] = 65535; + } + } + pos = num_prompt_tokens - 1; + cur_token = prompt_tokens[pos]; + } else if (eval_mode_ == EvalMode::kKVCached) { + KVCachedMemory::IO* ptr = + static_cast(io_mem_->get_mutable_ptr()); + ptr->input_ids = static_cast(cur_token); + ptr->attention_mask[max_seq_len_ - 1] = 65535; + } + + while (pos < seq_len - 1) { + run_model_step(inputs); + Tensor& logits_tensor = output_tensors.back().back(); + + if (pos == num_prompt_tokens) { + stats_.first_token_ms = util::time_in_ms(); + } else if (pos == num_prompt_tokens - 1) { + stats_.prompt_eval_end_ms = util::time_in_ms(); + } + + long sample_start_time_ms = util::time_in_ms(); + prev_token = cur_token; + cur_token = logitsToToken(logits_tensor); + stats_.aggregate_sampling_time_ms += + util::time_in_ms() - sample_start_time_ms; + + if (pos < num_prompt_tokens - 1) { + cur_token = prompt_tokens[pos + 1]; + } + io_mem_->update_io(cur_token, ++pos, output_tensors); + + auto piece_res = tokenizer_->decode(prev_token, cur_token); + ET_CHECK(piece_res.ok()); + + if (token_callback) { + token_callback(piece_res.get().c_str()); + } + + if (shouldStop_) { + break; + } + + if (pos >= num_prompt_tokens && cur_token == eos_id_) { + ET_LOG(Info, "\nReached to the end of generation"); + break; + } + } + stats_.inference_end_ms = util::time_in_ms(); + + if (pos == seq_len) { + ET_LOG(Info, "\nSequence length (%i tokens) reached!", seq_len); + } + + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = pos - num_prompt_tokens; + printReport(stats_); + if (stats_callback) { + stats_callback(stats_); + } + + return Error::Ok; +} + +namespace { +void printReport(const Runner::Stats& stats) { + printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str()); + + ET_LOG( + Info, + "\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64, + stats.num_prompt_tokens, + stats.num_generated_tokens); + + ET_LOG( + Info, + "\tModel Load Time:\t\t%f (seconds)", + ((double)(stats.model_load_end_ms - stats.model_load_start_ms) / + stats.SCALING_FACTOR_UNITS_PER_SECOND)); + double inference_time_ms = + (double)(stats.inference_end_ms - stats.inference_start_ms); + ET_LOG( + Info, + "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, + + (stats.num_generated_tokens) / + (double)(stats.inference_end_ms - stats.inference_start_ms) * + stats.SCALING_FACTOR_UNITS_PER_SECOND); + double prompt_eval_time = + (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); + ET_LOG( + Info, + "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, + (stats.num_prompt_tokens) / prompt_eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND); + + double eval_time = + (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); + ET_LOG( + Info, + "\t\tGenerated %" PRIu64 + " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + stats.num_generated_tokens, + eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, + stats.num_generated_tokens / eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND); + + // Time to first 
token is measured from the start of inference, excluding + // model load time. + ET_LOG( + Info, + "\tTime to first generated token:\t%f (seconds)", + ((double)(stats.first_token_ms - stats.inference_start_ms) / + stats.SCALING_FACTOR_UNITS_PER_SECOND)); + + ET_LOG( + Info, + "\tSampling time over %" PRIu64 " tokens:\t%f (seconds)", + stats.num_prompt_tokens + stats.num_generated_tokens, + (double)stats.aggregate_sampling_time_ms / + stats.SCALING_FACTOR_UNITS_PER_SECOND); +} + +std::string statsToJsonString(const Runner::Stats& stats) { + std::stringstream ss; + ss << "{\"prompt_tokens\":" << stats.num_prompt_tokens << "," + << "\"generated_tokens\":" << stats.num_generated_tokens << "," + << "\"model_load_start_ms\":" << stats.model_load_start_ms << "," + << "\"model_load_end_ms\":" << stats.model_load_end_ms << "," + << "\"inference_start_ms\":" << stats.inference_start_ms << "," + << "\"inference_end_ms\":" << stats.inference_end_ms << "," + << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << "," + << "\"first_token_ms\":" << stats.first_token_ms << "," + << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms + << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":" + << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}"; + return ss.str(); +} +} // namespace + +void Runner::stop() { + shouldStop_ = true; +} + +std::vector> Runner::get_methods_meta() { + std::vector> methods_meta; + methods_meta.reserve(modules_.size()); + for (std::shared_ptr& module : modules_) { + methods_meta.emplace_back(module->method_meta("forward")); + } + return methods_meta; +} +} // namespace executor +} // namespace torch diff --git a/examples/qualcomm/llama2/qaihub_runner/runner.h b/examples/qualcomm/llama2/qaihub_runner/runner.h new file mode 100644 index 0000000000..4012c39629 --- /dev/null +++ b/examples/qualcomm/llama2/qaihub_runner/runner.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple llama2 runner that includes preprocessing and post processing logic. +// The module takes in a string as input and emits a string as output. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { + +class Runner { + public: + explicit Runner( + const std::vector& models_path, + const std::vector& pos_embs_path, + const std::string& tokenizer_path, + const int eval_mode, + const float temperature, + const float logits_scale, + const int logits_offset); + + struct Stats { + // Scaling factor for timestamps - in this case, we use ms. + const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; + // Time stamps for the different stages of the execution + // model_load_start_ms: Start of model loading. + long model_load_start_ms; + // model_load_end_ms: End of model loading. + long model_load_end_ms; + // inference_start_ms: Immediately after the model is loaded (or we check + // for model load), measure the inference time. + long inference_start_ms; + // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right + // before the inference loop starts + long prompt_eval_end_ms; + // first_token: Timestamp when the first generated token is emitted + long first_token_ms; + // inference_end_ms: End of inference/generation. 
+ long inference_end_ms; + // Keep a running total of the time spent in sampling. + long aggregate_sampling_time_ms; + // Token count from prompt + int64_t num_prompt_tokens; + // Token count from generated (total - prompt) + int64_t num_generated_tokens; + }; + + bool is_loaded() const; + Error load(); + Error generate( + const std::string& prompt, + int32_t seq_len, + std::function token_callback = {}, + std::function stats_callback = {}); + void stop(); + std::vector> get_methods_meta(); + + private: + enum EvalMode { + kBert = 0, + kKVCached, + kUnsupported, + }; + + int32_t logitsToToken(const exec_aten::Tensor& logits_tensor); + void run_model_step(std::vector>& inputs); + // metadata + const int32_t bos_id_; + const int32_t eos_id_; + const int32_t n_bos_; + const int32_t n_eos_; + const int32_t vocab_size_; + const int32_t max_seq_len_; + int32_t eval_mode_; + std::vector> modules_; + std::string tokenizer_path_; + float temperature_; + std::unique_ptr tokenizer_; + std::unique_ptr sampler_; + bool shouldStop_{false}; + Stats stats_; + std::unique_ptr io_mem_; + const float logits_scale_; + const int32_t logits_offset_; +}; + +} // namespace executor +} // namespace torch diff --git a/examples/qualcomm/llama2/runner/runner.cpp b/examples/qualcomm/llama2/runner/runner.cpp index d68f9dfea4..009bb6b209 100644 --- a/examples/qualcomm/llama2/runner/runner.cpp +++ b/examples/qualcomm/llama2/runner/runner.cpp @@ -80,14 +80,14 @@ Error Runner::load() { if (tokenizer_->bos_tok() != bos_id_) { ET_LOG( Error, - "Tokenizer's BOS id %lu does not match model's BOS id %d, will override tokenizer's BOS.", + "Tokenizer's BOS id %lu does not match model's BOS id %ld, will override tokenizer's BOS.", tokenizer_->bos_tok(), bos_id_); } if (tokenizer_->eos_tok() != eos_id_) { ET_LOG( Error, - "Tokenizer's EOS id %lu does not match model's EOS id %d, will override tokenizer's EOS.", + "Tokenizer's EOS id %lu does not match model's EOS id %ld, will override tokenizer's EOS.", tokenizer_->eos_tok(), eos_id_); } diff --git a/examples/qualcomm/scripts/utils.py b/examples/qualcomm/scripts/utils.py index 274b4b39b7..6093f1dc1c 100755 --- a/examples/qualcomm/scripts/utils.py +++ b/examples/qualcomm/scripts/utils.py @@ -54,11 +54,11 @@ def __init__( ): self.qnn_sdk = qnn_sdk self.build_path = build_path - self.pte_path = pte_path + self.pte_path = pte_path if isinstance(pte_path, list) else [pte_path] self.workspace = workspace self.device_id = device_id self.host_id = host_id - self.working_dir = Path(self.pte_path).parent.absolute() + self.working_dir = Path(self.pte_path[0]).parent.absolute() self.input_list_filename = "input_list.txt" self.etdump_path = f"{self.workspace}/etdump.etdp" self.output_folder = f"{self.workspace}/outputs" @@ -84,19 +84,13 @@ def _adb(self, cmd): cmds, stdout=subprocess.DEVNULL if self.error_only else sys.stdout ) - def push(self, inputs, input_list, files=None): + def push(self, inputs=None, input_list=None, files=None): self._adb(["shell", f"rm -rf {self.workspace}"]) self._adb(["shell", f"mkdir -p {self.workspace}"]) - # prepare input list - input_list_file = f"{self.working_dir}/{self.input_list_filename}" - with open(input_list_file, "w") as f: - f.write(input_list) - f.flush() - # necessary artifacts - for artifact in [ - f"{self.pte_path}", + artifacts = [ + *self.pte_path, f"{self.qnn_sdk}/lib/aarch64-android/libQnnHtp.so", ( f"{self.qnn_sdk}/lib/hexagon-v{self.soc_model}/" @@ -110,21 +104,31 @@ def push(self, inputs, input_list, files=None): 
f"{self.qnn_sdk}/lib/aarch64-android/libQnnSystem.so", f"{self.build_path}/{self.runner}", f"{self.build_path}/backends/qualcomm/libqnn_executorch_backend.so", - input_list_file, - ]: + ] + + # prepare input list + if input_list is not None: + input_list_file = f"{self.working_dir}/{self.input_list_filename}" + with open(input_list_file, "w") as f: + f.write(input_list) + f.flush() + artifacts.append(input_list_file) + + for artifact in artifacts: self._adb(["push", artifact, self.workspace]) # input data - for idx, data in enumerate(inputs): - for i, d in enumerate(data): - file_name = f"{self.working_dir}/input_{idx}_{i}.raw" - d.detach().numpy().tofile(file_name) - self._adb(["push", file_name, self.workspace]) - - # extra files + if inputs is not None: + for idx, data in enumerate(inputs): + for i, d in enumerate(data): + file_name = f"{self.working_dir}/input_{idx}_{i}.raw" + d.detach().numpy().tofile(file_name) + self._adb(["push", file_name, self.workspace]) + + # custom files if files is not None: - for f in files: - self._adb(["push", f, self.workspace]) + for file_name in files: + self._adb(["push", file_name, self.workspace]) def execute(self, custom_runner_cmd=None): self._adb(["shell", f"mkdir -p {self.output_folder}"]) @@ -132,7 +136,7 @@ def execute(self, custom_runner_cmd=None): if custom_runner_cmd is None: qnn_executor_runner_args = " ".join( [ - f"--model_path {os.path.basename(self.pte_path)}", + f"--model_path {os.path.basename(self.pte_path[0])}", f"--output_folder_path {self.output_folder}", f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", @@ -153,12 +157,12 @@ def execute(self, custom_runner_cmd=None): self._adb(["shell", f"{qnn_executor_runner_cmds}"]) def pull(self, output_path, callback=None): - self._adb(["pull", "-a", f"{self.output_folder}", output_path]) + self._adb(["pull", "-a", self.output_folder, output_path]) if callback: callback() def pull_etdump(self, output_path, callback=None): - self._adb(["pull", f"{self.etdump_path}", output_path]) + self._adb(["pull", self.etdump_path, output_path]) if callback: callback()