Add sample code for GenAI model inferencing
Also, update the packaging logic to include sample code and
required runtime binaries.

NOTE: Native (both CPP & CS) binaries are not yet published by the
GenAI team; this needs to be revisited once those binaries are
available for download.
shaahji committed May 2, 2024
1 parent d5337d5 commit f799ca4
Showing 13 changed files with 528 additions and 21 deletions.
19 changes: 15 additions & 4 deletions .lintrunner.toml
@@ -41,6 +41,9 @@ include_patterns = [
'**/*.py',
'**/*.pyi'
]
exclude_patterns = [
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
'-m',
@@ -67,6 +70,7 @@ include_patterns = [
'**/*.py'
]
exclude_patterns = [
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
@@ -94,6 +98,7 @@ include_patterns = [
'**/*.py'
]
exclude_patterns = [
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
@@ -122,7 +127,8 @@ include_patterns = [
'**/*.pyi'
]
exclude_patterns = [
'examples/pytorch/*.py'
'examples/pytorch/*.py',
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
@@ -149,7 +155,9 @@ init_command = [
[[linter]]
code = 'NOQA'
include_patterns = ['**/*.py', '**/*.pyi']
exclude_patterns = []
exclude_patterns = [
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
'-m',
@@ -170,7 +178,9 @@ command = [
[[linter]]
code = 'SPACES'
include_patterns = ['**']
exclude_patterns = []
exclude_patterns = [
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
'-m',
@@ -194,7 +204,8 @@ include_patterns = ['**']
exclude_patterns = [
'.lintrunner.toml',
'**/Makefile',
'**/*.bat'
'**/*.bat',
'**/olive/engine/packaging/sample_code'
]
command = [
'python',
4 changes: 1 addition & 3 deletions examples/llama2/llama2_model_builder_template.json
@@ -75,9 +75,7 @@
"packaging_config": [
{
"type": "Zipfile",
"name": "OutputModel",
"include_runtime_packages": false,
"include_sample_code": false
"name": "OutputModel"
}
],
"log_severity_level": 0,
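For reference, the two flags removed above are still part of Olive's Zipfile packaging configuration; a minimal sketch of an explicit entry that opts out of the new packaging behavior (field names are taken from the removed lines; the defaults that apply when the flags are omitted are an assumption based on the commit message):

```
"packaging_config": [
    {
        "type": "Zipfile",
        "name": "OutputModel",
        "include_sample_code": false,
        "include_runtime_packages": false
    }
]
```

With the flags omitted, as in the updated example, the packaged output is expected to include both the sample code and the required runtime binaries.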
69 changes: 55 additions & 14 deletions olive/engine/packaging/packaging_generator.py
@@ -296,11 +296,17 @@ def _package_candidate_models(
tempdir = Path(temp_dir)

if packaging_type == PackagingType.Zipfile:
best_node: FootprintNode = _get_best_candidate_node(pf_footprints, footprints)
model_attributes = best_node.model_config["config"].get("model_attributes") or {}

if packaging_config.include_sample_code:
_package_sample_code(Path(__file__).parent, tempdir)
_package_sample_code(Path(__file__).parent, tempdir, model_attributes.get("is_generative", False))

if packaging_config.include_runtime_packages:
_package_onnxruntime_packages(tempdir, next(iter(pf_footprints.values())))
if model_attributes.get("is_generative", False):
_package_onnxruntime_genai_runtime_dependencies(tempdir)
else:
_package_onnxruntime_runtime_dependencies(tempdir, next(iter(pf_footprints.values())))

for accelerator_spec, pf_footprint in pf_footprints.items():
footprint = footprints[accelerator_spec]
@@ -436,8 +442,9 @@ def _copy_models_rank(tempdir: Path, model_info_list: List[Dict]):
f.write(json.dumps(model_info_list))


def _package_sample_code(cur_path: Path, tempdir: Path):
copy_dir(cur_path / "sample_code", tempdir / "SampleCode")
def _package_sample_code(cur_path: Path, tempdir: Path, is_generative: bool):
subdir_name = "GenAIOnnxModel" if is_generative else "ONNXModel"
copy_dir(cur_path / "sample_code" / subdir_name, tempdir / "SampleCode")


def _package_zipfile_model(output_dir: Path, output_name: str, model_dir: Path):
@@ -565,7 +572,42 @@ def _generate_onnx_mlflow_model(model_dir: Path, inference_config: Dict):
return mlflow_model_path


def _package_onnxruntime_packages(tempdir: Path, pf_footprint: "Footprint"):
def _package_onnxruntime_genai_runtime_dependencies(tempdir: Path):
# pylint: disable=not-an-iterable
installed_packages = [
pkg
for pkg in pkg_resources.working_set
if pkg.key.startswith("onnxruntime-genai") or pkg.project_name.startswith("onnxruntime-genai")
]
if not installed_packages:
logger.warning("ONNXRuntime-GenAI package is not installed. Skip packaging runtime packages.")
return

DOWNLOAD_COMMAND_TEMPLATE = Template(
f"{sys.executable} -m pip download $package_name==$version --no-deps -d $python_download_path"
)
python_download_path = tempdir / "ONNXRuntimePackages" / "python"
python_download_path.mkdir(parents=True, exist_ok=True)
python_download_path = str(python_download_path)

for pkg in installed_packages:
pkg_name = pkg.key if pkg.key.startswith("onnxruntime-genai") else pkg.project_name
download_command = DOWNLOAD_COMMAND_TEMPLATE.substitute(
package_name=pkg_name, version=pkg.version, python_download_path=python_download_path
)

try:
run_subprocess(download_command)
except Exception:
logger.exception(
"Failed to download %s package. Manually download & install the required package.", pkg_name
)

# Download CPP && CS onnxruntime-genai packages
# TODO(olive-devteam): As of this writing the native packages aren't published.


def _package_onnxruntime_runtime_dependencies(tempdir: Path, pf_footprint: "Footprint"):
# pylint: disable=not-an-iterable
installed_packages = pkg_resources.working_set
onnxruntime_pkg = [i for i in installed_packages if i.key.startswith("onnxruntime")]
@@ -581,19 +623,18 @@ def _package_onnxruntime_packages(tempdir: Path, pf_footprint: "Footprint"):
logger.warning("Both ONNXRuntime and ort-nightly packages are installed. Package ort-nightly package only.")

ort_version = ort_nightly_pkg[0].version if is_nightly else onnxruntime_pkg[0].version
package_name_list = set()
use_ort_extensions = False

for model_id in pf_footprint.nodes:
if pf_footprint.get_use_ort_extensions(model_id):
use_ort_extensions = True

inference_settings = pf_footprint.get_model_inference_config(model_id)
package_name_list = []
if not inference_settings:
package_name_list.append(("onnxruntime", "ort-nightly"))
else:
if inference_settings:
ep_list = inference_settings["execution_provider"]
package_name_list.extend([get_package_name_from_ep(ep[0]) for ep in ep_list])
package_name_list = set(package_name_list)
package_name_list.update([get_package_name_from_ep(ep[0]) for ep in ep_list])
else:
package_name_list.update(["onnxruntime", "ort-nightly"])

try:
# Download Python onnxruntime package
@@ -637,7 +678,7 @@ def _package_onnxruntime_packages(tempdir: Path, pf_footprint: "Footprint"):
if is_nightly:
_skip_download_c_package(ort_download_path)
else:
_download_c_packages(package_name_list, ort_version, ort_download_path)
_download_native_onnx_packages(package_name_list, ort_version, ort_download_path)

except Exception:
logger.exception("Failed to download onnxruntime package. Please manually download onnxruntime package.")
@@ -675,7 +716,7 @@ def _download_ort_extensions_package(use_ort_extensions: bool, download_path: str
run_subprocess(download_command)


def _download_c_packages(package_name_list: List[str], ort_version: str, ort_download_path: str):
def _download_native_onnx_packages(package_name_list: List[str], ort_version: str, ort_download_path: str):
PACKAGE_DOWNLOAD_LINK_MAPPING = {
"onnxruntime": Template("https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime/$ort_version"),
"onnxruntime-gpu": Template("https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime.Gpu/$ort_version"),
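For context, DOWNLOAD_COMMAND_TEMPLATE in _package_onnxruntime_genai_runtime_dependencies expands to an ordinary pip download invocation per installed onnxruntime-genai package; an illustrative expansion (the version and destination directory are placeholders, and python stands in for sys.executable):

```
python -m pip download onnxruntime-genai==<installed version> --no-deps -d <tempdir>/ONNXRuntimePackages/python
```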
@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

cmake_minimum_required(VERSION 3.10)
project(olive-genai-cpp-sample)
set(CMAKE_CXX_STANDARD 20)

add_executable(olive-genai-cpp-sample code_sample.cpp)
target_include_directories(olive-genai-cpp-sample
PRIVATE include
PRIVATE include/onnxruntime-genai
)
target_link_libraries(olive-genai-cpp-sample
PRIVATE onnxruntime-genai
)
target_link_directories(olive-genai-cpp-sample
PRIVATE lib
)

if (MSVC)
# MSVC doesn't report correct value for __cplusplus without the explicit flag
# Ref: https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
target_compile_options(olive-genai-cpp-sample PRIVATE "/Zc:__cplusplus")

add_custom_command(TARGET olive-genai-cpp-sample POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${PROJECT_SOURCE_DIR}/lib/onnxruntime.dll"
$<TARGET_FILE_DIR:olive-genai-cpp-sample>
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${PROJECT_SOURCE_DIR}/lib/onnxruntime-genai.dll"
$<TARGET_FILE_DIR:olive-genai-cpp-sample>
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${PROJECT_SOURCE_DIR}/lib/onnxruntime_providers_shared.dll"
$<TARGET_FILE_DIR:olive-genai-cpp-sample>
)
endif()
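The CMake project above assumes the GenAI headers and runtime libraries have been staged next to the sample sources; a sketch of the layout it expects, inferred from the include/link directories and the post-build copy commands (file names are Windows-flavored, and the native binaries themselves are not shipped by this commit, per the NOTE in the commit message):

```
cpp/
├── CMakeLists.txt
├── code_sample.cpp
├── include/
│   ├── nlohmann/json.hpp
│   └── onnxruntime-genai/      (ort_genai.h and related headers)
└── lib/
    ├── onnxruntime.dll
    ├── onnxruntime-genai.dll
    └── onnxruntime_providers_shared.dll
```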
21 changes: 21 additions & 0 deletions olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/README.md
@@ -0,0 +1,21 @@
# Olive sample code instructions

## Prerequisites
Install the following:
* GCC 11.0 or higher for Linux
* Microsoft Visual Studio 2022 for Windows
* CMake

## Building sample code
Run the following commands in the sample code's directory.
```
mkdir build
cmake -S . -B build
cmake --build build
```

## Running the built binary
Run the following command in the build directory.
```
./olive-genai-cpp-sample <Model's directory path>
```
@@ -0,0 +1,94 @@
#include "nlohmann/json.hpp"
#include "ort_genai.h"

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>

namespace fs = std::filesystem;

static void print_usage(int /*argc*/, char **argv)
{
std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
}

bool load_search_options(const fs::path& dirpath, std::unique_ptr<OgaGeneratorParams> &params)
{
const fs::path config_filepath = dirpath / "genai_config.json";
std::ifstream istrm(config_filepath);
if (!istrm.is_open()) return false;

const nlohmann::json j = nlohmann::json::parse(istrm);
if (auto k = j.find("search"); k != j.end())
{
if (auto it = k->find("diversity_penalty"); it != k->end()) params->SetSearchOption("diversity_penalty", *it);
if (auto it = k->find("do_sample"); it != k->end()) params->SetSearchOptionBool("do_sample", *it);
if (auto it = k->find("early_stopping"); it != k->end()) params->SetSearchOptionBool("early_stopping", *it);
if (auto it = k->find("length_penalty"); it != k->end()) params->SetSearchOption("length_penalty", *it);
if (auto it = k->find("max_length"); it != k->end()) params->SetSearchOption("max_length", *it);
if (auto it = k->find("min_length"); it != k->end()) params->SetSearchOption("min_length", *it);
if (auto it = k->find("no_repeat_ngram_size"); it != k->end()) params->SetSearchOption("no_repeat_ngram_size", *it);
if (auto it = k->find("num_beams"); it != k->end()) params->SetSearchOption("num_beams", *it);
if (auto it = k->find("num_return_sequences"); it != k->end()) params->SetSearchOption("num_return_sequences", *it);
if (auto it = k->find("past_present_share_buffer"); it != k->end()) params->SetSearchOptionBool("past_present_share_buffer", *it);
if (auto it = k->find("repetition_penalty"); it != k->end()) params->SetSearchOption("repetition_penalty", *it);
if (auto it = k->find("temperature"); it != k->end()) params->SetSearchOption("temperature", *it);
if (auto it = k->find("top_k"); it != k->end()) params->SetSearchOption("top_k", *it);
if (auto it = k->find("top_p"); it != k->end()) params->SetSearchOption("top_p", *it);
}
istrm.close();
return true;
}

int main(int argc, char **argv)
{
if (argc != 2)
{
print_usage(argc, argv);
return -1;
}

const char *const model_path = argv[1];

std::cout << "Loading model ..." << std::endl;
auto model = OgaModel::Create(model_path);

std::cout << "Creating tokenizer ..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);

std::cout << "Loading genai_config.json ..." << std::endl;
auto params = OgaGeneratorParams::Create(*model);

std::cout << "Evaluating generator params and search options ..." << std::endl;
load_search_options(model_path, params);

const char* const prompt = "Who is Albert Einstein?";
auto sequences = OgaSequences::Create();

std::cout << "Encoding prompt ..." << std::endl;
tokenizer->Encode(prompt, *sequences);
params->SetInputSequences(*sequences);

std::cout << "Generating tokens ..." << std::endl;
auto start = std::chrono::high_resolution_clock::now();
auto output_sequences = model->Generate(*params);
auto run_time = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - start);

std::cout << "Decoding generated tokens ..." << std::endl;
auto out_sequences = output_sequences->Get(0);
auto out_string = tokenizer->Decode(out_sequences);

std::cout << "Prompt: " << std::endl
<< prompt << std::endl << std::endl;
std::cout << "Output: " << std::endl
<< out_string << std::endl << std::endl;

std::cout << std::setprecision(2)
<< "Tokens: " << out_sequences.size()
<< ", run_time: " << run_time.count() << " seconds"
<< ", Tokens/sec: " << std::setprecision(2) << out_sequences.size() / (double)run_time.count()
<< std::endl;

return 0;
}
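For reference, load_search_options above reads the optional "search" block of the model directory's genai_config.json; a hedged illustration with a few of the recognized keys (the values are placeholders; in practice the file is generated alongside the exported model rather than written by hand):

```
{
    "search": {
        "do_sample": false,
        "max_length": 2048,
        "num_beams": 1,
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 1.0
    }
}
```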
@@ -0,0 +1,7 @@
# Olive sample code instructions

## Prerequisites
Install Microsoft Visual Studio 2022 for Windows

## Running the sample code
Load the included Visual Studio solution, build, and run.
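Only C++ and C# samples are added in this commit, but the Python onnxruntime-genai wheel packaged by _package_onnxruntime_genai_runtime_dependencies supports the same flow; a rough Python sketch of the C++ sample above, assuming the onnxruntime-genai Python API (class and method names may vary between releases):

```
# Rough Python equivalent of code_sample.cpp; assumes the onnxruntime-genai
# wheel from the packaged ONNXRuntimePackages/python directory is installed.
import onnxruntime_genai as og

model_path = "<model directory>"             # placeholder: folder containing genai_config.json
model = og.Model(model_path)                 # load the exported GenAI ONNX model
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=256)    # illustrative; mirror genai_config.json as needed

prompt = "Who is Albert Einstein?"
params.input_ids = tokenizer.encode(prompt)

output_tokens = model.generate(params)       # one generated sequence per input sequence
print(tokenizer.decode(output_tokens[0]))
```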