Qualcomm AI Engine Direct - add program validation #4297
Changes from 2 commits
@@ -58,6 +58,7 @@
from executorch.exir import ExirExportedProgram
from executorch.exir.backend.compile_spec_schema import CompileSpec
from executorch.exir.lowered_backend_module import LoweredBackendModule
from executorch.exir.program._program import _get_updated_graph_signature
from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions
from torch.export.exported_program import ExportedProgram
from torch.fx import passes
@@ -223,7 +224,12 @@ def capture_program(
    core_ep.transform(ConvertBinaryOpsWithScalar())
    edge_ep = core_ep.to_edge(qnn_edge_config())
    _transform(edge_ep.exported_program)

    # Since QDQ nodes are stripped, update graph signature again to validate program
    edge_ep.exported_program._graph_signature = _get_updated_graph_signature(
        edge_ep.exported_program.graph_signature,
        edge_ep.exported_program.graph_module,
    )
    edge_ep.exported_program._validate()
    return edge_ep

Review discussion on this hunk:

Reviewer: Not related to this diff, but is it possible to run a mix of fp ops and quantized ops, and is that well supported? I'm asking because we remove all q/dq ops and then insert them back at the i/o, which may limit mixed dtypes.

Author: Currently, mixed precision is only supported for quantized ops, since the compiler spec for HTP precision (quantized or fp16) works at graph-level granularity.

Author: To do so, it would be great if the framework interface could provide a runtime option (like having an argument in […]

Reviewer: Yeah, we've been discussing how to pass the runtime option at the interface, ideally through the backend context. Question: do you need it to be […]
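For context, a minimal sketch of how the validated edge program returned by capture_program is typically consumed downstream, together with the graph-level HTP precision choice mentioned in the discussion above. The helper names and import paths (generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, QnnPartitioner, QcomChipset) are assumptions about the QNN backend's utilities and may differ between versions; this is not part of the diff.

    # Hedged sketch, not part of this PR: helper names/paths below are assumptions
    # and may not match the exact executorch version.
    import torch

    from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner  # assumed path
    from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import QcomChipset  # assumed path
    from executorch.backends.qualcomm.utils.utils import (
        capture_program,  # the function modified in this hunk
        generate_htp_compiler_spec,  # assumed helper for graph-level HTP precision
        generate_qnn_executorch_compiler_spec,  # assumed helper producing a CompileSpec list
    )
    from executorch.exir.backend.backend_api import to_backend

    module = torch.nn.Linear(16, 8).eval()
    sample_inputs = (torch.randn(1, 16),)

    # capture_program now refreshes the graph signature after QDQ stripping and
    # runs _validate(), so the returned edge program is internally consistent.
    edge_ep = capture_program(module, sample_inputs)

    # HTP precision (fp16 vs. quantized) is selected once for the whole graph,
    # which is why mixed fp/quantized dtypes are not supported yet (see discussion above).
    compiler_specs = generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8550,
        backend_options=generate_htp_compiler_spec(use_fp16=True),
    )
    delegated_program = to_backend(edge_ep.exported_program, QnnPartitioner(compiler_specs))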
@@ -352,14 +352,7 @@ def to_backend(
        ExportedProgram: The input program, with some portions targeted for delegation.
    """
    # Use fake program, with FakeTensors in the state dict, to avoid copying large constant values.
    # Fall back to deepcopy if no fake mode is found. TODO(T182910699): Remove this fallback.
    try:
        fake_edge_program = get_fake_program(edge_program)
    except Exception as e:
        logging.warning(
            f"Error in get_fake_program for graph {edge_program.graph_module}, fallback to deepcopy: {e}"
        )
        fake_edge_program = copy.deepcopy(edge_program)
    fake_edge_program = get_fake_program(edge_program)

    partitioner_result = partitioner_instance(fake_edge_program)
    tagged_exported_program = partitioner_result.tagged_exported_program

Review discussion on this hunk:

Reviewer: This change could be split out; we may need broader testing in case some paths still rely on the fallback. Glad to see the QNN path can work with the fake edge program! I believe RAM usage will go down quite a bit now.
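As background for the comment above about RAM usage: get_fake_program swaps the real constants in the program's state dict for FakeTensors, so the partitioner can inspect shapes and dtypes without duplicating weight data. Below is a minimal sketch of the underlying FakeTensor idea (illustrative only; this is not the actual get_fake_program implementation).

    # Illustrative sketch of why fake tensors are cheap to "copy".
    import torch
    from torch._subclasses.fake_tensor import FakeTensorMode

    real_weight = torch.randn(4096, 4096)  # ~64 MiB of real storage

    fake_mode = FakeTensorMode()
    fake_weight = fake_mode.from_tensor(real_weight)

    # The fake tensor keeps shape/dtype/device metadata but allocates no real storage,
    # so a program whose state dict holds fake tensors can be duplicated cheaply.
    print(fake_weight.shape, fake_weight.dtype)   # torch.Size([4096, 4096]) torch.float32
    print(isinstance(fake_weight, torch.Tensor))  # True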
@@ -84,25 +84,16 @@ option(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT
)

if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT)
  # Add the host project. We build this separately so that we can generate
  # headers on the host during the build, even if we're cross-compiling the
  # flatcc runtime to a different architecture.

  # lint_cmake: -readability/wonkycase
  ExternalProject_Add(
    flatcc_project
    PREFIX ${CMAKE_BINARY_DIR}/_host_build
    SOURCE_DIR ${_flatcc_source_dir}
    BINARY_DIR ${CMAKE_BINARY_DIR}/_host_build
    CMAKE_CACHE_ARGS
      -DFLATCC_TEST:BOOL=OFF -DFLATCC_REFLECTION:BOOL=OFF
      # See above comment about POSITION_INDEPENDENT_CODE.
      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
    INSTALL_COMMAND "" # Prevent the install step
  execute_process(
    COMMAND ${CMAKE_COMMAND} ${_flatcc_source_dir}
      -DFLATCC_TEST=OFF -DFLATCC_REFLECTION=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
      -B${CMAKE_BINARY_DIR}/_host_build
  )
  set(_etdump_schema_gen_dep flatcc_project)
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build
  )
  set(_etdump_schema_gen_dep)
else()
  # If we're not cross-compiling, we can just use the plain commandline target.
  set(_etdump_schema_gen_dep flatcc_cli)
endif()

Review discussion on this hunk:

Reviewer: Also, thanks for fixing the sdk build! cc: @Olivia-liu @tarun292
@@ -134,42 +125,11 @@ add_library(
  bundled_program_schema INTERFACE ${_bundled_program_schema__outputs}
)

# Ensure the host tool is built before the main project
add_dependencies(etdump_schema flatcc_cli)

file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump)
file(MAKE_DIRECTORY
  ${_program_schema__include_dir}/executorch/sdk/bundled_program
)

if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT)
  # If we're cross-compiling, we need to use the version of the commandline tool
  # built for the host.
  set(_etdump_schema_gen_dep flatcc_project)

  # TODO(dbort): flatcc installs its files directly in its source directory
  # instead of under CMAKE_BINARY_DIR, and it has no options to avoid doing
  # this. We build flatcc twice in the executorch build: once to get the
  # `flatcc` host commandline tool, and once to get the (potentially
  # cross-compiled) target runtime library. The host build will put its outputs
  # in the source tree, making the cross-compiling target build think that the
  # outputs have already been built. It will then try to link against the
  # host-architecture libraries, failing when cross-compiling. To work around
  # this, delete the host outputs after running this command (which only runs
  # when setting up the cmake files, not when actually building). This leaves
  # room for the target build to put its own files in the source tree. We should
  # try to remove this hack, ideally by submitting an upstream PR that adds an
  # option to change the installation location.
  set(_etdump_schema_cleanup_paths ${_flatcc_source_dir}/bin/*
    ${_flatcc_source_dir}/lib/*
  )
else()
  # If we're not cross-compiling we can use the plain commandline target, and we
  # don't need to delete any files.
  set(_etdump_schema_gen_dep flatcc_cli)
  set(_etdump_schema_cleanup_paths "")
endif()

add_custom_command(
  OUTPUT ${_etdump_schema__outputs}
  COMMAND
@@ -179,13 +139,19 @@ add_custom_command(
    ${_flatcc_source_dir}/bin/flatcc -cwr -o
    ${_program_schema__include_dir}/executorch/sdk/etdump
    ${_etdump_schema__srcs}
  COMMAND rm -f ${_etdump_schema_cleanup_paths}
  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk
  DEPENDS ${_etdump_schema_gen_dep}
  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk
  COMMENT "Generating etdump headers"
  VERBATIM
)

add_custom_target(
  etdump_schema_generated
  DEPENDS ${_etdump_schema__outputs}
)

add_dependencies(etdump_schema etdump_schema_generated)

add_library(
  etdump ${CMAKE_CURRENT_SOURCE_DIR}/etdump/etdump_flatcc.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/etdump/emitter.cpp
Review discussion (on another part of this diff):

Reviewer: Is coeff inserted into the graph, or is it just an intermediate tensor? If it's inserted, do we need to lift it to the i/o?

Author: It's inserted, but we make it static inside the graph so it can be identified when building the operator. This can reduce extra memory copies, I think.
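A minimal sketch of the "static inside the graph" idea described above, using plain torch.fx (the module and tensor names here are illustrative, not taken from this PR): registering the coefficient as a buffer makes it appear as a get_attr constant in the graph rather than an extra input that would have to be fed, and copied, at call time.

    # Illustrative torch.fx sketch; `CoeffModule` and `coeff` are made-up names.
    import torch
    import torch.fx as fx

    class CoeffModule(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # A buffer is graph-owned state, not a runtime input.
            self.register_buffer("coeff", torch.tensor([0.5, 0.25]))

        def forward(self, x):
            return x * self.coeff

    gm = fx.symbolic_trace(CoeffModule())
    # The coefficient shows up as a get_attr node (a static constant of the graph),
    # not as a placeholder that would require lifting it to the graph i/o.
    print([node.op for node in gm.graph.nodes])
    # ['placeholder', 'get_attr', 'call_function', 'output']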