From f799ca479647340224fc987ca74432501bd3ec13 Mon Sep 17 00:00:00 2001
From: Hitesh Shah
Date: Thu, 25 Apr 2024 18:16:01 -0700
Subject: [PATCH] Add sample code for GenAI model inferencing

Also, update the packaging logic to include sample code and required runtime
binaries.

NOTE: Native (both CPP & CS) binaries are not yet being published by the
GenAI team and will need to be revisited once those binaries are available
for download.
---
 .lintrunner.toml                              |  19 ++-
 .../llama2/llama2_model_builder_template.json |   4 +-
 olive/engine/packaging/packaging_generator.py |  69 +++++++--
 .../GenAIOnnxModel/cpp/CMakeLists.txt         |  36 +++++
 .../sample_code/GenAIOnnxModel/cpp/README.md  |  21 +++
 .../GenAIOnnxModel/cpp/code_sample.cpp        |  94 ++++++++++++
 .../sample_code/GenAIOnnxModel/cs/README.md   |   7 +
 .../GenAIOnnxModel/cs/code_sample.cs          |  73 +++++++++
 .../cs/olive-genai-cs-sample.csproj           |  25 +++
 .../cs/olive-genai-cs-sample.sln              |  25 +++
 .../GenAIOnnxModel/python/README.md           |  31 ++++
 .../GenAIOnnxModel/python/code_sample.py      | 144 ++++++++++++++++++
 olive/passes/onnx/model_builder.py            |   1 +
 13 files changed, 528 insertions(+), 21 deletions(-)
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/CMakeLists.txt
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/README.md
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/code_sample.cpp
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cs/README.md
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cs/code_sample.cs
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.csproj
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.sln
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/python/README.md
 create mode 100644 olive/engine/packaging/sample_code/GenAIOnnxModel/python/code_sample.py

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 785915bf7..004c55f69 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -41,6 +41,9 @@ include_patterns = [
     '**/*.py',
     '**/*.pyi'
 ]
+exclude_patterns = [
+    '**/olive/engine/packaging/sample_code'
+]
 command = [
     'python',
     '-m',
@@ -67,6 +70,7 @@ include_patterns = [
     '**/*.py'
 ]
 exclude_patterns = [
+    '**/olive/engine/packaging/sample_code'
 ]
 command = [
     'python',
@@ -94,6 +98,7 @@ include_patterns = [
     '**/*.py'
 ]
 exclude_patterns = [
+    '**/olive/engine/packaging/sample_code'
 ]
 command = [
     'python',
@@ -122,7 +127,8 @@ include_patterns = [
     '**/*.pyi'
 ]
 exclude_patterns = [
-    'examples/pytorch/*.py'
+    'examples/pytorch/*.py',
+    '**/olive/engine/packaging/sample_code'
 ]
 command = [
     'python',
@@ -149,7 +155,9 @@ init_command = [
 [[linter]]
 code = 'NOQA'
 include_patterns = ['**/*.py', '**/*.pyi']
-exclude_patterns = []
+exclude_patterns = [
+    '**/olive/engine/packaging/sample_code'
+]
 command = [
     'python',
     '-m',
@@ -170,7 +178,9 @@ command = [
 [[linter]]
 code = 'SPACES'
 include_patterns = ['**']
-exclude_patterns = []
+exclude_patterns = [
+    '**/olive/engine/packaging/sample_code'
+]
 command = [
     'python',
     '-m',
@@ -194,7 +204,8 @@ include_patterns = ['**']
 exclude_patterns = [
     '.lintrunner.toml',
     '**/Makefile',
-    '**/*.bat'
+    '**/*.bat',
+    '**/olive/engine/packaging/sample_code'
 ]
 command = [
     'python',
diff --git a/examples/llama2/llama2_model_builder_template.json b/examples/llama2/llama2_model_builder_template.json
index d005eb6d3..8f7c5ab66 100644
--- a/examples/llama2/llama2_model_builder_template.json
+++ b/examples/llama2/llama2_model_builder_template.json
@@ -75,9 +75,7 @@
     "packaging_config": [
         {
             "type": "Zipfile",
-            "name": "OutputModel",
-            "include_runtime_packages": false,
-            "include_sample_code": false
+            "name": "OutputModel"
         }
     ],
     "log_severity_level": 0,
diff --git a/olive/engine/packaging/packaging_generator.py b/olive/engine/packaging/packaging_generator.py
index fe586df13..32f7fbada 100644
--- a/olive/engine/packaging/packaging_generator.py
+++ b/olive/engine/packaging/packaging_generator.py
@@ -296,11 +296,17 @@ def _package_candidate_models(
     tempdir = Path(temp_dir)
 
     if packaging_type == PackagingType.Zipfile:
+        best_node: FootprintNode = _get_best_candidate_node(pf_footprints, footprints)
+        model_attributes = best_node.model_config["config"].get("model_attributes") or {}
+
         if packaging_config.include_sample_code:
-            _package_sample_code(Path(__file__).parent, tempdir)
+            _package_sample_code(Path(__file__).parent, tempdir, model_attributes.get("is_generative", False))
 
         if packaging_config.include_runtime_packages:
-            _package_onnxruntime_packages(tempdir, next(iter(pf_footprints.values())))
+            if model_attributes.get("is_generative", False):
+                _package_onnxruntime_genai_runtime_dependencies(tempdir)
+            else:
+                _package_onnxruntime_runtime_dependencies(tempdir, next(iter(pf_footprints.values())))
 
     for accelerator_spec, pf_footprint in pf_footprints.items():
         footprint = footprints[accelerator_spec]
@@ -436,8 +442,9 @@ def _copy_models_rank(tempdir: Path, model_info_list: List[Dict]):
         f.write(json.dumps(model_info_list))
 
 
-def _package_sample_code(cur_path: Path, tempdir: Path):
-    copy_dir(cur_path / "sample_code", tempdir / "SampleCode")
+def _package_sample_code(cur_path: Path, tempdir: Path, is_generative: bool):
+    subdir_name = "GenAIOnnxModel" if is_generative else "ONNXModel"
+    copy_dir(cur_path / "sample_code" / subdir_name, tempdir / "SampleCode")
 
 
 def _package_zipfile_model(output_dir: Path, output_name: str, model_dir: Path):
@@ -565,7 +572,42 @@ def _generate_onnx_mlflow_model(model_dir: Path, inference_config: Dict):
     return mlflow_model_path
 
 
-def _package_onnxruntime_packages(tempdir: Path, pf_footprint: "Footprint"):
+def _package_onnxruntime_genai_runtime_dependencies(tempdir: Path):
+    # pylint: disable=not-an-iterable
+    installed_packages = [
+        pkg
+        for pkg in pkg_resources.working_set
+        if pkg.key.startswith("onnxruntime-genai") or pkg.project_name.startswith("onnxruntime-genai")
+    ]
+    if not installed_packages:
+        logger.warning("ONNXRuntime-GenAI package is not installed. Skip packaging runtime packages.")
+        return
+
+    DOWNLOAD_COMMAND_TEMPLATE = Template(
+        f"{sys.executable} -m pip download $package_name==$version --no-deps -d $python_download_path"
+    )
+    python_download_path = tempdir / "ONNXRuntimePackages" / "python"
+    python_download_path.mkdir(parents=True, exist_ok=True)
+    python_download_path = str(python_download_path)
+
+    for pkg in installed_packages:
+        pkg_name = pkg.key if pkg.key.startswith("onnxruntime-genai") else pkg.project_name
+        download_command = DOWNLOAD_COMMAND_TEMPLATE.substitute(
+            package_name=pkg_name, version=pkg.version, python_download_path=python_download_path
+        )
+
+        try:
+            run_subprocess(download_command)
+        except Exception:
+            logger.exception(
+                "Failed to download %s package. Manually download & install the required package.", pkg_name
+            )
+
+    # Download CPP && CS onnxruntime-genai packages
+    # TODO(olive-devteam): As of this writing the native packages aren't published.
+
+
+def _package_onnxruntime_runtime_dependencies(tempdir: Path, pf_footprint: "Footprint"):
     # pylint: disable=not-an-iterable
     installed_packages = pkg_resources.working_set
     onnxruntime_pkg = [i for i in installed_packages if i.key.startswith("onnxruntime")]
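Once the native packages are published, the TODO above can likely be resolved by mirroring the NuGet download flow that `_download_native_onnx_packages` uses below for the core onnxruntime packages. A minimal, purely illustrative sketch — the `Microsoft.ML.OnnxRuntimeGenAI` package ID is inferred from the DLL name referenced by the C# sample, and both the ID and the URL layout remain assumptions until the GenAI team actually ships the native packages:

```
from pathlib import Path
from string import Template
from urllib import request

# Hypothetical download link: the NuGet package ID and URL layout are assumptions
# until the native onnxruntime-genai packages are published.
GENAI_NUPKG_LINK = Template("https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntimeGenAI/$version")


def download_native_genai_package(tempdir: Path, version: str) -> Path:
    """Download the (assumed) native GenAI nupkg next to the Python wheels."""
    download_dir = tempdir / "ONNXRuntimePackages"
    download_dir.mkdir(parents=True, exist_ok=True)
    nupkg_path = download_dir / f"Microsoft.ML.OnnxRuntimeGenAI.{version}.nupkg"
    request.urlretrieve(GENAI_NUPKG_LINK.substitute(version=version), str(nupkg_path))
    return nupkg_path
```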
@@ -581,19 +623,18 @@ def _package_onnxruntime_packages(tempdir: Path, pf_footprint: "Footprint"):
         logger.warning("Both ONNXRuntime and ort-nightly packages are installed. Package ort-nightly package only.")
 
     ort_version = ort_nightly_pkg[0].version if is_nightly else onnxruntime_pkg[0].version
+    package_name_list = set()
     use_ort_extensions = False
-
     for model_id in pf_footprint.nodes:
         if pf_footprint.get_use_ort_extensions(model_id):
             use_ort_extensions = True
+
         inference_settings = pf_footprint.get_model_inference_config(model_id)
-        package_name_list = []
-        if not inference_settings:
-            package_name_list.append(("onnxruntime", "ort-nightly"))
-        else:
+        if inference_settings:
             ep_list = inference_settings["execution_provider"]
-            package_name_list.extend([get_package_name_from_ep(ep[0]) for ep in ep_list])
-            package_name_list = set(package_name_list)
+            package_name_list.update([get_package_name_from_ep(ep[0]) for ep in ep_list])
+        else:
+            package_name_list.update(["onnxruntime", "ort-nightly"])
 
     try:
         # Download Python onnxruntime package
@@ -637,7 +678,7 @@
             if is_nightly:
                 _skip_download_c_package(ort_download_path)
             else:
-                _download_c_packages(package_name_list, ort_version, ort_download_path)
+                _download_native_onnx_packages(package_name_list, ort_version, ort_download_path)
     except Exception:
         logger.exception("Failed to download onnxruntime package. Please manually download onnxruntime package.")
 
@@ -675,7 +716,7 @@ def _download_ort_extensions_package(use_ort_extensions: bool, download_path: str):
     run_subprocess(download_command)
 
 
-def _download_c_packages(package_name_list: List[str], ort_version: str, ort_download_path: str):
+def _download_native_onnx_packages(package_name_list: List[str], ort_version: str, ort_download_path: str):
     PACKAGE_DOWNLOAD_LINK_MAPPING = {
         "onnxruntime": Template("https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime/$ort_version"),
         "onnxruntime-gpu": Template("https://www.nuget.org/api/v2/package/Microsoft.ML.OnnxRuntime.Gpu/$ort_version"),
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/CMakeLists.txt b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/CMakeLists.txt
new file mode 100644
index 000000000..3d60825ff
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.10)
+project(olive-genai-cpp-sample)
+set(CMAKE_CXX_STANDARD 20)
+
+add_executable(olive-genai-cpp-sample code_sample.cpp)
+target_include_directories(olive-genai-cpp-sample
+    PRIVATE include
+    PRIVATE include/onnxruntime-genai
+)
+target_link_libraries(olive-genai-cpp-sample
+    PRIVATE onnxruntime-genai
+)
+target_link_directories(olive-genai-cpp-sample
+    PRIVATE lib
+)
+
+if (MSVC)
+    # MSVC doesn't report correct value for __cplusplus without the explicit flag
+    # Ref: https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
+    target_compile_options(olive-genai-cpp-sample PRIVATE "/Zc:__cplusplus")
+
+    add_custom_command(TARGET olive-genai-cpp-sample POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "${PROJECT_SOURCE_DIR}/lib/onnxruntime.dll"
+            $<TARGET_FILE_DIR:olive-genai-cpp-sample>
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "${PROJECT_SOURCE_DIR}/lib/onnxruntime-genai.dll"
+            $<TARGET_FILE_DIR:olive-genai-cpp-sample>
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            "${PROJECT_SOURCE_DIR}/lib/onnxruntime_providers_shared.dll"
+            $<TARGET_FILE_DIR:olive-genai-cpp-sample>
+    )
+endif()
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/README.md b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/README.md
new file mode 100644
index 000000000..6390f409e
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/README.md
@@ -0,0 +1,21 @@
+# Olive sample code instructions
+
+## Prerequisites
+Install the following:
+* GCC 11.0 or higher for Linux
+* Microsoft Visual Studio 2022 for Windows
+* CMake
+
+## Building sample code
+Run the following commands in the sample code's directory.
+```
+mkdir build
+cmake -S . -B build
+cmake --build build
+```
+
+## Running the built binary
+Run the built binary from the build directory, passing the path to the folder that contains the generated model and its genai_config.json.
+```
+./olive-genai-cpp-sample <model_folder_path>
+```
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/code_sample.cpp b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/code_sample.cpp
new file mode 100644
index 000000000..58934abb3
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cpp/code_sample.cpp
@@ -0,0 +1,94 @@
+#include "nlohmann/json.hpp"
+#include "ort_genai.h"
+
+#include <chrono>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+
+namespace fs = std::filesystem;
+
+static void print_usage(int /*argc*/, char **argv)
+{
+    std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
+}
+
+bool load_search_options(const fs::path& dirpath, std::unique_ptr<OgaGeneratorParams> &params)
+{
+    const fs::path config_filepath = dirpath / "genai_config.json";
+    std::ifstream istrm(config_filepath);
+    if (!istrm.is_open()) return false;
+
+    const nlohmann::json j = nlohmann::json::parse(istrm);
+    if (auto k = j.find("search"); k != j.end())
+    {
+        if (auto it = k->find("diversity_penalty"); it != k->end()) params->SetSearchOption("diversity_penalty", *it);
+        if (auto it = k->find("do_sample"); it != k->end()) params->SetSearchOptionBool("do_sample", *it);
+        if (auto it = k->find("early_stopping"); it != k->end()) params->SetSearchOptionBool("early_stopping", *it);
+        if (auto it = k->find("length_penalty"); it != k->end()) params->SetSearchOption("length_penalty", *it);
+        if (auto it = k->find("max_length"); it != k->end()) params->SetSearchOption("max_length", *it);
+        if (auto it = k->find("min_length"); it != k->end()) params->SetSearchOption("min_length", *it);
+        if (auto it = k->find("no_repeat_ngram_size"); it != k->end()) params->SetSearchOption("no_repeat_ngram_size", *it);
+        if (auto it = k->find("num_beams"); it != k->end()) params->SetSearchOption("num_beams", *it);
+        if (auto it = k->find("num_return_sequences"); it != k->end()) params->SetSearchOption("num_return_sequences", *it);
+        if (auto it = k->find("past_present_share_buffer"); it != k->end()) params->SetSearchOptionBool("past_present_share_buffer", *it);
+        if (auto it = k->find("repetition_penalty"); it != k->end()) params->SetSearchOption("repetition_penalty", *it);
+        if (auto it = k->find("temperature"); it != k->end()) params->SetSearchOption("temperature", *it);
+        if (auto it = k->find("top_k"); it != k->end()) params->SetSearchOption("top_k", *it);
+        if (auto it = k->find("top_p"); it != k->end()) params->SetSearchOption("top_p", *it);
+    }
+    istrm.close();
+    return true;
+}
+
+int main(int argc, char **argv)
+{
+    if (argc != 2)
+    {
+        print_usage(argc, argv);
+        return -1;
+    }
+
+    const char *const model_path = argv[1];
+
+    std::cout << "Loading model ..." << std::endl;
+    auto model = OgaModel::Create(model_path);
+
+    std::cout << "Creating tokenizer ..." << std::endl;
+    auto tokenizer = OgaTokenizer::Create(*model);
+
+    std::cout << "Loading genai_config.json ..." << std::endl;
+    auto params = OgaGeneratorParams::Create(*model);
+
+    std::cout << "Evaluating generator params and search options ..." << std::endl;
+    load_search_options(model_path, params);
+
+    const char* const prompt = "Who is Albert Einstein?";
+    auto sequences = OgaSequences::Create();
+
+    std::cout << "Encoding prompt ..." << std::endl;
+    tokenizer->Encode(prompt, *sequences);
+    params->SetInputSequences(*sequences);
+
+    std::cout << "Generating tokens ..." << std::endl;
+    auto start = std::chrono::high_resolution_clock::now();
+    auto output_sequences = model->Generate(*params);
+    auto run_time = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - start);
+
+    std::cout << "Decoding generated tokens ..." << std::endl;
+    auto out_sequences = output_sequences->Get(0);
+    auto out_string = tokenizer->Decode(out_sequences);
+
+    std::cout << "Prompt: " << std::endl
+              << prompt << std::endl << std::endl;
+    std::cout << "Output: " << std::endl
+              << out_string << std::endl << std::endl;
+
+    std::cout << std::setprecision(2)
+              << "Tokens: " << out_sequences.size()
+              << ", run_time: " << run_time.count() << " seconds"
+              << ", Tokens/sec: " << std::setprecision(2) << out_sequences.size() / (double)run_time.count()
+              << std::endl;
+
+    return 0;
+}
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/README.md b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/README.md
new file mode 100644
index 000000000..095695239
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/README.md
@@ -0,0 +1,7 @@
+# Olive sample code instructions
+
+## Prerequisites
+Install Microsoft Visual Studio 2022 for Windows.
+
+## Running the sample code
+Load the included Visual Studio solution, build, and run.
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/code_sample.cs b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/code_sample.cs
new file mode 100644
index 000000000..2de62bb56
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/code_sample.cs
@@ -0,0 +1,73 @@
+/*
+-------------------------------------------------------------------------
+Copyright (c) Microsoft Corporation. All rights reserved.
+Licensed under the MIT License.
+--------------------------------------------------------------------------
+*/
+using Microsoft.ML.OnnxRuntimeGenAI;
+using Newtonsoft.Json;
+using Newtonsoft.Json.Linq;
+using System.Diagnostics;
+
+string modelPath = args[0];
+
+Console.WriteLine("Loading model ...");
+using Model model = new(modelPath);
+
+Console.WriteLine("Creating tokenizer ...");
+using Tokenizer tokenizer = new(model);
+
+Console.WriteLine("Loading genai_config.json ...");
+string config_filepath = Path.Combine(modelPath, "genai_config.json");
+JObject? config;
+using (StreamReader file = File.OpenText(config_filepath))
+{
+    using JsonTextReader reader = new(file);
+    config = (JObject)JToken.ReadFrom(reader);
+}
+
+Console.WriteLine("Evaluating generator params and search options ...");
+using GeneratorParams generatorParams = new(model);
+JToken? search;
+if (config.TryGetValue("search", out search))
+{
+    foreach (var entry in (JObject)search)
+    {
+        switch (entry.Key)
+        {
+            case "diversity_penalty": generatorParams.SetSearchOption("diversity_penalty", double.Parse((string)entry.Value)); break;
+            case "do_sample": generatorParams.SetSearchOption("do_sample", bool.Parse((string)entry.Value)); break;
+            case "early_stopping": generatorParams.SetSearchOption("early_stopping", bool.Parse((string)entry.Value)); break;
+            case "length_penalty": generatorParams.SetSearchOption("length_penalty", double.Parse((string)entry.Value)); break;
+            case "max_length": generatorParams.SetSearchOption("max_length", double.Parse((string)entry.Value)); break;
+            case "min_length": generatorParams.SetSearchOption("min_length", double.Parse((string)entry.Value)); break;
+            case "no_repeat_ngram_size": generatorParams.SetSearchOption("no_repeat_ngram_size", double.Parse((string)entry.Value)); break;
+            case "num_beams": generatorParams.SetSearchOption("num_beams", double.Parse((string)entry.Value)); break;
+            case "num_return_sequences": generatorParams.SetSearchOption("num_return_sequences", double.Parse((string)entry.Value)); break;
+            case "past_present_share_buffer": generatorParams.SetSearchOption("past_present_share_buffer", bool.Parse((string)entry.Value)); break;
+            case "repetition_penalty": generatorParams.SetSearchOption("repetition_penalty", double.Parse((string)entry.Value)); break;
+            case "temperature": generatorParams.SetSearchOption("temperature", double.Parse((string)entry.Value)); break;
+            case "top_k": generatorParams.SetSearchOption("top_k", double.Parse((string)entry.Value)); break;
+            case "top_p": generatorParams.SetSearchOption("top_p", double.Parse((string)entry.Value)); break;
+        }
+    }
+}
+
+Console.WriteLine("Encoding prompts ...");
+const string prompt = "Who is Albert Einstein?";
+using var sequences = tokenizer.Encode(prompt);
+generatorParams.SetInputSequences(sequences);
+
+Console.WriteLine("Generating tokens ...");
+var watch = Stopwatch.StartNew();
+var outputSequences = model.Generate(generatorParams);
+var runTime = watch.Elapsed.TotalSeconds;
+
+Console.WriteLine("Decoding generated tokens ...");
+var answer = tokenizer.Decode(outputSequences[0]);
+
+Console.WriteLine("Prompt: " + prompt);
+Console.WriteLine("Output: " + answer);
+Console.WriteLine(
+    string.Format("Tokens: {0}, Time: {1:0.00} seconds, Tokens/second: {2:0.00}",
+        outputSequences[0].Length, runTime, outputSequences[0].Length / (double)runTime));
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.csproj b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.csproj
new file mode 100644
index 000000000..c5748ff99
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.csproj
@@ -0,0 +1,25 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <RootNamespace>olive_genai_cs_sample</RootNamespace>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Reference Include="Microsoft.ML.OnnxRuntimeGenAI">
+      <HintPath>libs\Microsoft.ML.OnnxRuntimeGenAI.dll</HintPath>
+    </Reference>
+  </ItemGroup>
+
+</Project>
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.sln b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.sln
new file mode 100644
index 000000000..86b115d71
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/cs/olive-genai-cs-sample.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.9.34728.123
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "olive-genai-cs-sample", "olive-genai-cs-sample.csproj", "{2AF3BED4-D369-49FF-8342-3C956EB79C96}"
+EndProject
+Global
+  GlobalSection(SolutionConfigurationPlatforms) = preSolution
+    Debug|Any CPU = Debug|Any CPU
+    Release|Any CPU = Release|Any CPU
+  EndGlobalSection
+  GlobalSection(ProjectConfigurationPlatforms) = postSolution
+    {2AF3BED4-D369-49FF-8342-3C956EB79C96}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+    {2AF3BED4-D369-49FF-8342-3C956EB79C96}.Debug|Any CPU.Build.0 = Debug|Any CPU
+    {2AF3BED4-D369-49FF-8342-3C956EB79C96}.Release|Any CPU.ActiveCfg = Release|Any CPU
+    {2AF3BED4-D369-49FF-8342-3C956EB79C96}.Release|Any CPU.Build.0 = Release|Any CPU
+  EndGlobalSection
+  GlobalSection(SolutionProperties) = preSolution
+    HideSolutionNode = FALSE
+  EndGlobalSection
+  GlobalSection(ExtensibilityGlobals) = postSolution
+    SolutionGuid = {FA7B3C72-534F-4A33-93C1-67EDFF333B8E}
+  EndGlobalSection
+EndGlobal
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/python/README.md b/olive/engine/packaging/sample_code/GenAIOnnxModel/python/README.md
new file mode 100644
index 000000000..a9bb9bf2c
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/python/README.md
@@ -0,0 +1,31 @@
+# Olive sample code instructions
+
+## ONNXRuntime GenAI installation
+Install the onnxruntime-genai package:
+
+### Install by pip (CPU)
+```
+python -m pip install onnxruntime-genai
+```
+
+### Install by pip (CUDA)
+```
+python -m pip install onnxruntime-genai-cuda --pre --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/
+```
+
+### Install by pip (DirectML)
+```
+python -m pip install onnxruntime-genai-directml
+```
+
+For updated instructions and/or configuring a locally built package, refer to the instructions [here](https://github.com/microsoft/onnxruntime-genai).
+
+## Running the sample code
+```
+python code_sample.py <model_folder_path> [--prompts "prompt1" "prompt2" ...]
+```
+
+For a full list of available options, run the script in help mode.
+```
+python code_sample.py -h
+```
diff --git a/olive/engine/packaging/sample_code/GenAIOnnxModel/python/code_sample.py b/olive/engine/packaging/sample_code/GenAIOnnxModel/python/code_sample.py
new file mode 100644
index 000000000..e4a325b4e
--- /dev/null
+++ b/olive/engine/packaging/sample_code/GenAIOnnxModel/python/code_sample.py
@@ -0,0 +1,144 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import argparse
+import copy
+import json
+import os
+import sys
+import time
+
+import onnxruntime_genai as og
+
+# ruff: noqa
+
+
+def _main():
+    parser = argparse.ArgumentParser(description="End-to-end token generation loop example for gen-ai model")
+    parser.add_argument(
+        "model", type=str, help="Onnx model folder path (must contain genai_config.json and model.onnx)"
+    )
+    parser.add_argument("-pr", "--prompts", nargs="*", required=False, help="Input prompts to generate tokens from")
+    parser.add_argument(
+        "--diversity_penalty",
+        type=float,
+        help="This value is subtracted from a beam’s score if it generates a token same as any beam from other group at a particular time. Note that diversity_penalty is only effective if group beam search is enabled.",
+    )
+    parser.add_argument(
+        "--do_sample", type=bool, help="Whether or not to use sampling; use greedy decoding otherwise."
+    )
+    parser.add_argument(
+        "--early_stopping",
+        type=bool,
+        help="Whether to stop the beam search when at least num_beams sentences are finished per batch or not.",
+    )
+    parser.add_argument(
+        "--length_penalty",
+        type=float,
+        help="Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.",
+    )
+    parser.add_argument("--max_length", type=int, help="Max number of tokens to generate for each prompt")
+    parser.add_argument("--min_length", type=int, help="The minimum length of tokens to be generated for each prompt.")
+    parser.add_argument(
+        "--no_repeat_ngram_size", type=int, help="If set to int > 0, all ngrams of that size can only occur once."
+    )
+    parser.add_argument("--num_beams", type=int, help="Number of beams for beam search. 1 means no beam search.")
+    parser.add_argument(
+        "--num_return_sequences",
+        type=int,
+        help="The number of independently computed returned sequences for each element in the batch.",
+    )
+    parser.add_argument(
+        "--past_present_share_buffer",
+        type=bool,
+        help="The past/present kv tensors are shared and allocated once to max_length (cuda only)",
+    )
+    parser.add_argument(
+        "--repetition_penalty",
+        type=float,
+        help="Repetition penalty to sample with. The parameter for repetition penalty. 1.0 means no penalty.",
+    )
+    parser.add_argument("--temperature", type=float, help="The value used to modulate the next token probabilities.")
+    parser.add_argument(
+        "--top_k", type=int, help="The number of highest probability vocabulary tokens to keep for top-k-filtering."
+    )
+    parser.add_argument(
+        "--top_p",
+        type=float,
+        help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
+    )
+    args = parser.parse_args()
+
+    print("Loading model...")
+    model = og.Model(f"{args.model}")
+
+    print("Creating tokenizer ...")
+    tokenizer = og.Tokenizer(model)
+
+    print("Loading genai_config.json ...")
+    genai_config_filepath = os.path.join(args.model, "genai_config.json")
+    with open(genai_config_filepath) as strm:
+        genai_config = json.load(strm)
+
+    print("Evaluating generator params and search options ...")
+    params = og.GeneratorParams(model)
+
+    search_options: dict = copy.deepcopy(genai_config["search"])
+    search_options.update(
+        {
+            name: getattr(args, name)
+            for name in [
+                "diversity_penalty",
+                "do_sample",
+                "early_stopping",
+                "length_penalty",
+                "max_length",
+                "min_length",
+                "no_repeat_ngram_size",
+                "num_beams",
+                "num_return_sequences",
+                "past_present_share_buffer",
+                "repetition_penalty",
+                "temperature",
+                "top_k",
+                "top_p",
+            ]
+            if name in args and getattr(args, name)
+        }
+    )
+    params.set_search_options(**search_options)
+
+    print("Encoding prompts ...")
+    if args.prompts is not None:
+        prompts = args.prompts
+    else:
+        prompts = ["I like walking my cute dog", "What is the best restaurant in town?", "Hello, how are you today?"]
+    params.input_ids = tokenizer.encode_batch(prompts)
+
+    print("Generating tokens ...")
+    start_time = time.time()
+    output_tokens = model.generate(params)
+    run_time = time.time() - start_time
+
+    print("Decoding generated tokens ...")
+    print()
+    output_token_count = 0
+    for i, _ in enumerate(prompts):
+        print(f"Prompt #{i:02d}: {prompts[i]}")
+        print()
+        print(tokenizer.decode(output_tokens[i]))
+        print()
+
+        output_token_count += len(output_tokens[i])
+
+    print()
+    print(f"Tokens: {output_token_count}, Time: {run_time:.2f} seconds, Tokens per second: {output_token_count / run_time:.2f}")
+    print()
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(_main())
diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py
index eb454b6e1..d319f3191 100644
--- a/olive/passes/onnx/model_builder.py
+++ b/olive/passes/onnx/model_builder.py
@@ -220,6 +220,7 @@ def _run_for_config(
         model_attributes = copy.deepcopy(model.model_attributes or {})
         model_attributes["additional_files"] = additional_files = model_attributes.get("additional_files", [])
         additional_files.extend(fp for fp in output_model_filepath.parent.iterdir() if fp not in filepaths_to_ignore)
+        model_attributes["is_generative"] = True
 
         if metadata_only:
             output_model = copy.copy(model)
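For reference, the packaging behavior exercised by this change is driven entirely by the existing `packaging_config` options. A minimal sketch of a Zipfile packaging entry that keeps both the sample code and the runtime packages in the generated archive — the field names come from the llama2 template edited above, while the values and comments are illustrative:

```
# Illustrative packaging entry for an Olive workflow config. Omitting the two
# include_* flags (as the llama2 template now does) leaves them at their defaults,
# so the GenAI sample code and onnxruntime-genai wheels are packaged for
# generative models.
packaging_config = [
    {
        "type": "Zipfile",
        "name": "OutputModel",
        "include_sample_code": True,
        "include_runtime_packages": True,
    }
]
```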