From 37ab63a8f1e92690ee1b7205e8c4ff3912aadf67 Mon Sep 17 00:00:00 2001 From: Devang Patel <47577486+devang-ml@users.noreply.github.com> Date: Thu, 9 May 2024 13:31:45 -0700 Subject: [PATCH 01/20] Add a pass to convert fp16 io to fp32 --- docs/source/api/passes.rst | 6 ++ docs/source/features/passes/onnx.md | 13 +++ olive/hardware/constants.py | 1 + olive/olive_config.json | 3 + olive/passes/onnx/float32_conversion.py | 120 ++++++++++++++++++++++++ olive/passes/onnx/model_builder.py | 2 + 6 files changed, 145 insertions(+) create mode 100644 olive/passes/onnx/float32_conversion.py diff --git a/docs/source/api/passes.rst b/docs/source/api/passes.rst index 547994bd2..faf6e577e 100644 --- a/docs/source/api/passes.rst +++ b/docs/source/api/passes.rst @@ -42,6 +42,12 @@ OnnxFloatToFloat16 -------------------- .. autoconfigclass:: olive.passes.OnnxFloatToFloat16 +.. _onnx_io_float16_to_float32: + +OnnxIOFloat16ToFloat32 +-------------------- +.. autoconfigclass:: olive.passes.OnnxIOFloat16ToFloat32 + .. _ort_mixed_precision: OrtMixedPrecision diff --git a/docs/source/features/passes/onnx.md b/docs/source/features/passes/onnx.md index 05d7d689b..a22bc30a4 100644 --- a/docs/source/features/passes/onnx.md +++ b/docs/source/features/passes/onnx.md @@ -428,6 +428,19 @@ b. More fine-grained control of the conversion conditions is also possible: See [Float16 Conversion](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#float16-conversion) for more detailed description of the available configuration parameters. +## Inputs/Outputs Float16 to Float32 Conversion + +Certain environments such as Onnxruntime WebGPU prefers Float32 logits. The `OnnxIOFloat16ToFloat32` pass converts the inputs and outputs to use Float16 instead of Float32. + +### Example Configuration + +a. The most basic configuration, which is suitable for many models, leaves all configuration options set to their default values: +```json +{ + "type": "OnnxIOFloat16ToFloat32" +} +``` + ## Mixed Precision Conversion Converting model to mixed precision. diff --git a/olive/hardware/constants.py b/olive/hardware/constants.py index 6f002f719..9146e3891 100644 --- a/olive/hardware/constants.py +++ b/olive/hardware/constants.py @@ -28,6 +28,7 @@ "MIGraphXExecutionProvider", "TensorrtExecutionProvider", "OpenVINOExecutionProvider", + "JsExecutionProvider" ], "npu": ["QNNExecutionProvider"], } diff --git a/olive/olive_config.json b/olive/olive_config.json index 1825342e2..aca50c927 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -48,6 +48,9 @@ "OnnxFloatToFloat16": { "module_path": "olive.passes.onnx.float16_conversion.OnnxFloatToFloat16" }, + "OnnxIOFloat16ToFloat32": { + "module_path": "olive.passes.onnx.float16_conversion.OnnxIOFloat16ToFloat32" + }, "OnnxMatMul4Quantizer": { "module_path": "olive.passes.onnx.quantization.OnnxMatMul4Quantizer" }, diff --git a/olive/passes/onnx/float32_conversion.py b/olive/passes/onnx/float32_conversion.py new file mode 100644 index 000000000..8e52d465a --- /dev/null +++ b/olive/passes/onnx/float32_conversion.py @@ -0,0 +1,120 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +from pathlib import Path +from typing import Any, Dict, List + +from collections import defaultdict +import onnx +from olive.hardware.accelerator import AcceleratorSpec +from olive.model import ONNXModelHandler +from olive.model.utils import resolve_onnx_path +from olive.passes import Pass +from olive.passes.onnx.common import get_external_data_config, model_proto_to_olive_model +from olive.passes.pass_config import PassConfigParam +import re + +class OnnxIOFloat16ToFloat32(Pass): + """Converts float16 model inputs/outputs to float32. + + """ + + @classmethod + def _default_config(cls, accelerator_spec: AcceleratorSpec) -> Dict[str, PassConfigParam]: + config = { + "name_pattern": PassConfigParam( + type_=List[str], default_value="logits", description="Only convert inputs/outputs whose name matches this pattern" + ) + } + config.update(get_external_data_config()) + return config + + def create_io_mapping(graph, i_map, o_map): + for n in graph.node: + for i in n.input: + i_map[i].append(n) + for n in graph.node: + for o in n.output: + assert o not in o_map[o] + o_map[o] = [n] + + def wrap_inputs(graph, i_map, names): + # 1. find fp16 inputs + # 2. rewrite all consumers + # 3. insert cast + # 4. rewrite graph inputs + inputs = [n for n in graph.input if n.type.tensor_type.elem_type == onnx.TensorProto.FLOAT16] + for i in inputs: + if names: + match = names.search(i.name) + if not match: + continue + print(f"input {i.name} from fp32") + for n in i_map[i.name]: + for j, o in enumerate(n.input): + if o == i.name: + n.input[j] = i.name + "_fp16" + cast = onnx.helper.make_node( + "Cast", + inputs=[i.name], + outputs=[i.name + "_fp16"], + to=onnx.TensorProto.FLOAT16, + ) + graph.node.insert(0, cast) + i.type.tensor_type.elem_type = onnx.TensorProto.FLOAT + + + def wrap_outputs(graph, i_map, o_map, names): + # 1. find fp16 outputs + # 2. rewrite all providers + # 3. append cast + # 4. 
rewrite graph outputs + outputs = [n for n in graph.output if n.type.tensor_type.elem_type == onnx.TensorProto.FLOAT16] + for o in outputs: + if names: + match = names.search(o.name) + if not match: + continue + print(f"output {o.name} to fp32") + for n in o_map[o.name]: + for j, i in enumerate(n.output): + if i == o.name: + n.output[j] = o.name + "_fp16" + for n in i_map[o.name]: + for j, i in enumerate(n.input): + if i == o.name: + n.input[j] = o.name + "_fp16" + + cast = onnx.helper.make_node( + "Cast", + inputs=[o.name + "_fp16"], + outputs=[o.name], + to=onnx.TensorProto.FLOAT, + ) + graph.node.append(cast) + o.type.tensor_type.elem_type = onnx.TensorProto.FLOAT + + def _run_for_config( + self, model: ONNXModelHandler, data_root: str, config: Dict[str, Any], output_model_path: str + ) -> ONNXModelHandler: + from onnxruntime.transformers.onnx_model import OnnxModel + + output_model_path = resolve_onnx_path(output_model_path, Path(model.model_path).name) + + ort_onnx_model = OnnxModel(model.load_model()) + + i_map = defaultdict(list) + o_map = defaultdict(list) + + self.create_io_mapping(model.graph, i_map, o_map) + + pat = None + if args.name: + pat = re.compile(args.name) + + self.wrap_inputs(model.graph, i_map, pat) + self.wrap_outputs(model.graph, i_map, o_map, pat) + + # save the model to the output path and return the model + return model_proto_to_olive_model(ort_onnx_model.model, output_model_path, config) diff --git a/olive/passes/onnx/model_builder.py b/olive/passes/onnx/model_builder.py index 08bfbed38..a3e7f5cf2 100644 --- a/olive/passes/onnx/model_builder.py +++ b/olive/passes/onnx/model_builder.py @@ -155,6 +155,8 @@ def _run_for_config( target_execution_provider = "dml" elif self.accelerator_spec.execution_provider == "CUDAExecutionProvider": target_execution_provider = "cuda" + elif self.accelerator_spec.execution_provider == "JsExecutionProvider": + target_execution_provider = "web" else: target_execution_provider = "cpu" From f6fe26b1343d7b88714785bd32e4b2251b17a18a Mon Sep 17 00:00:00 2001 From: Devang Patel <47577486+devang-ml@users.noreply.github.com> Date: Thu, 9 May 2024 13:59:06 -0700 Subject: [PATCH 02/20] Update phi2 example --- examples/phi2/phi2.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/examples/phi2/phi2.py b/examples/phi2/phi2.py index 5643b03de..cd4346db6 100644 --- a/examples/phi2/phi2.py +++ b/examples/phi2/phi2.py @@ -22,6 +22,7 @@ "cuda_fp16": [["convert", "optimize_cuda", "perf_tuning"]], "cuda_int4": [["convert", "optimize_cuda", "blockwise_quant_int4", "perf_tuning"]], "slicegpt": [["slice"]], + "web": [["builder", "io_float16_to_float32"]], } SUPPORTED_INFERENCE_CONFIG = { "cpu_fp32": { @@ -54,6 +55,7 @@ DEVICE_TO_EP = { "cpu": "CPUExecutionProvider", "gpu": "CUDAExecutionProvider", + "web": "JsExecutionProvider", } @@ -64,8 +66,8 @@ def get_args(raw_args): "--model_type", type=str, default=None, - choices=["cpu_fp32", "cpu_int4", "cuda_fp16", "cuda_int4"], - help="Choose from cpu_fp32, cpu_int4, cuda_fp16, cuda_int4", + choices=["cpu_fp32", "cpu_int4", "cuda_fp16", "cuda_int4", "web"], + help="Choose from cpu_fp32, cpu_int4, cuda_fp16, cuda_int4 or web", ) parser.add_argument( "--finetune_method", @@ -140,12 +142,21 @@ def main(raw_args=None): template_json["passes"]["builder"]["config"]["precision"] = precision template_json["systems"]["local_system"]["config"]["accelerators"] = [ {"device": device, "execution_providers": [DEVICE_TO_EP[device.lower()]]} + ] + new_json_file = f"phi2_web.json" + with 
open(new_json_file, "w") as f: + json.dump(template_json, f, indent=4) + elif model_type == "web": + json_file_template = "phi2_genai.json" + with open(json_file_template) as f: + template_json = json.load(f) + template_json["passes"]["builder"]["config"]["precision"] = "int4" + template_json["systems"]["local_system"]["config"]["accelerators"] = [ + {"device": "GPU", "execution_providers": ["JsExecutionProvider"]} ] - - new_json_file = f"phi2_genai_{device.lower()}.json" + new_json_file = f"phi2_web.json" with open(new_json_file, "w") as f: json.dump(template_json, f, indent=4) - else: if not args.optimum_optimization and not args.slicegpt and version.parse(OrtVersion) < version.parse("1.18.0"): # Check if onnxruntime version is supported From 601f83b7db3f86db5bd45c5bd4fe8baf499991ba Mon Sep 17 00:00:00 2001 From: Devang Patel <47577486+devang-ml@users.noreply.github.com> Date: Thu, 9 May 2024 16:24:07 -0700 Subject: [PATCH 03/20] Add convert fp16 io to fp32 pass to webGPU example --- examples/phi2/phi2.py | 2 ++ olive/olive_config.json | 2 +- olive/passes/onnx/float32_conversion.py | 22 +++++++++++++--------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/phi2/phi2.py b/examples/phi2/phi2.py index cd4346db6..a997df30a 100644 --- a/examples/phi2/phi2.py +++ b/examples/phi2/phi2.py @@ -154,6 +154,8 @@ def main(raw_args=None): template_json["systems"]["local_system"]["config"]["accelerators"] = [ {"device": "GPU", "execution_providers": ["JsExecutionProvider"]} ] + fl_type = { "type" : "OnnxIOFloat16ToFloat32"} + template_json["passes"]["fp32_logits"] = fl_type new_json_file = f"phi2_web.json" with open(new_json_file, "w") as f: json.dump(template_json, f, indent=4) diff --git a/olive/olive_config.json b/olive/olive_config.json index aca50c927..a7276fdc3 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -49,7 +49,7 @@ "module_path": "olive.passes.onnx.float16_conversion.OnnxFloatToFloat16" }, "OnnxIOFloat16ToFloat32": { - "module_path": "olive.passes.onnx.float16_conversion.OnnxIOFloat16ToFloat32" + "module_path": "olive.passes.onnx.float32_conversion.OnnxIOFloat16ToFloat32" }, "OnnxMatMul4Quantizer": { "module_path": "olive.passes.onnx.quantization.OnnxMatMul4Quantizer" diff --git a/olive/passes/onnx/float32_conversion.py b/olive/passes/onnx/float32_conversion.py index 8e52d465a..6c63b691f 100644 --- a/olive/passes/onnx/float32_conversion.py +++ b/olive/passes/onnx/float32_conversion.py @@ -24,13 +24,17 @@ class OnnxIOFloat16ToFloat32(Pass): def _default_config(cls, accelerator_spec: AcceleratorSpec) -> Dict[str, PassConfigParam]: config = { "name_pattern": PassConfigParam( - type_=List[str], default_value="logits", description="Only convert inputs/outputs whose name matches this pattern" + type_=str, default_value="logits", + description=( + "Only convert inputs/outputs whose name matches this pattern. By default" + "looking for logits names" + ) ) } config.update(get_external_data_config()) return config - def create_io_mapping(graph, i_map, o_map): + def create_io_mapping(self, graph, i_map, o_map): for n in graph.node: for i in n.input: i_map[i].append(n) @@ -39,7 +43,7 @@ def create_io_mapping(graph, i_map, o_map): assert o not in o_map[o] o_map[o] = [n] - def wrap_inputs(graph, i_map, names): + def wrap_inputs(self, graph, i_map, names): # 1. find fp16 inputs # 2. rewrite all consumers # 3. 
insert cast @@ -65,7 +69,7 @@ def wrap_inputs(graph, i_map, names): i.type.tensor_type.elem_type = onnx.TensorProto.FLOAT - def wrap_outputs(graph, i_map, o_map, names): + def wrap_outputs(self, graph, i_map, o_map, names): # 1. find fp16 outputs # 2. rewrite all providers # 3. append cast @@ -107,14 +111,14 @@ def _run_for_config( i_map = defaultdict(list) o_map = defaultdict(list) - self.create_io_mapping(model.graph, i_map, o_map) + self.create_io_mapping(ort_onnx_model.model.graph, i_map, o_map) pat = None - if args.name: - pat = re.compile(args.name) + if config["name_pattern"]: + pat = re.compile(config["name_pattern"]) - self.wrap_inputs(model.graph, i_map, pat) - self.wrap_outputs(model.graph, i_map, o_map, pat) + self.wrap_inputs(ort_onnx_model.model.graph, i_map, pat) + self.wrap_outputs(ort_onnx_model.model.graph, i_map, o_map, pat) # save the model to the output path and return the model return model_proto_to_olive_model(ort_onnx_model.model, output_model_path, config) From d7991ac6fb7140c30065567adc127be74784cdf4 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Mon, 13 May 2024 08:22:19 +0000 Subject: [PATCH 04/20] Add unit test --- olive/hardware/constants.py | 2 +- olive/passes/onnx/float32_conversion.py | 32 +++++++------ .../passes/onnx/test_float32_conversion.py | 47 +++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) create mode 100644 test/unit_test/passes/onnx/test_float32_conversion.py diff --git a/olive/hardware/constants.py b/olive/hardware/constants.py index 9146e3891..2856fd120 100644 --- a/olive/hardware/constants.py +++ b/olive/hardware/constants.py @@ -28,7 +28,7 @@ "MIGraphXExecutionProvider", "TensorrtExecutionProvider", "OpenVINOExecutionProvider", - "JsExecutionProvider" + "JsExecutionProvider", ], "npu": ["QNNExecutionProvider"], } diff --git a/olive/passes/onnx/float32_conversion.py b/olive/passes/onnx/float32_conversion.py index 6c63b691f..211a5ce12 100644 --- a/olive/passes/onnx/float32_conversion.py +++ b/olive/passes/onnx/float32_conversion.py @@ -2,33 +2,36 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import logging +import re +from collections import defaultdict from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict -from collections import defaultdict import onnx + from olive.hardware.accelerator import AcceleratorSpec from olive.model import ONNXModelHandler from olive.model.utils import resolve_onnx_path from olive.passes import Pass from olive.passes.onnx.common import get_external_data_config, model_proto_to_olive_model from olive.passes.pass_config import PassConfigParam -import re -class OnnxIOFloat16ToFloat32(Pass): - """Converts float16 model inputs/outputs to float32. +logger = logging.getLogger(__name__) + - """ +class OnnxIOFloat16ToFloat32(Pass): + """Converts float16 model inputs/outputs to float32.""" @classmethod def _default_config(cls, accelerator_spec: AcceleratorSpec) -> Dict[str, PassConfigParam]: config = { "name_pattern": PassConfigParam( - type_=str, default_value="logits", + type_=str, + default_value="logits", description=( - "Only convert inputs/outputs whose name matches this pattern. By default" - "looking for logits names" - ) + "Only convert inputs/outputs whose name matches this pattern. 
By defaultlooking for logits names" + ), ) } config.update(get_external_data_config()) @@ -40,7 +43,7 @@ def create_io_mapping(self, graph, i_map, o_map): i_map[i].append(n) for n in graph.node: for o in n.output: - assert o not in o_map[o] + assert o not in o_map o_map[o] = [n] def wrap_inputs(self, graph, i_map, names): @@ -54,7 +57,7 @@ def wrap_inputs(self, graph, i_map, names): match = names.search(i.name) if not match: continue - print(f"input {i.name} from fp32") + logger.debug("input %s from fp32", i.name) for n in i_map[i.name]: for j, o in enumerate(n.input): if o == i.name: @@ -63,12 +66,11 @@ def wrap_inputs(self, graph, i_map, names): "Cast", inputs=[i.name], outputs=[i.name + "_fp16"], - to=onnx.TensorProto.FLOAT16, + to=onnx.TensorProto.FLOAT, ) graph.node.insert(0, cast) i.type.tensor_type.elem_type = onnx.TensorProto.FLOAT - def wrap_outputs(self, graph, i_map, o_map, names): # 1. find fp16 outputs # 2. rewrite all providers @@ -80,7 +82,7 @@ def wrap_outputs(self, graph, i_map, o_map, names): match = names.search(o.name) if not match: continue - print(f"output {o.name} to fp32") + logger.debug("output %s from fp32", o.name) for n in o_map[o.name]: for j, i in enumerate(n.output): if i == o.name: diff --git a/test/unit_test/passes/onnx/test_float32_conversion.py b/test/unit_test/passes/onnx/test_float32_conversion.py new file mode 100644 index 000000000..a0385aeee --- /dev/null +++ b/test/unit_test/passes/onnx/test_float32_conversion.py @@ -0,0 +1,47 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from olive.model.handler.onnx import ONNXModelHandler +from olive.passes.olive_pass import create_pass_from_dict +from olive.passes.onnx.float32_conversion import OnnxIOFloat16ToFloat32 +from test.unit_test.utils import get_onnx_model +import onnx +from onnx import helper +from onnx import TensorProto + + +def test_onnx_io_ft16_to_ft32_conversion(tmp_path): + # setup + node1 = helper.make_node( + 'Add', + ['logits_A', 'logits_B'], + ['logits_C'], + name='add_node' + ) + + input_tensor_A = helper.make_tensor_value_info('logits_A', TensorProto.FLOAT16, [None]) + input_tensor_B = helper.make_tensor_value_info('logits_B', TensorProto.FLOAT16, [None]) + output_tensor_C = helper.make_tensor_value_info('logits_C', TensorProto.FLOAT16, [None]) + + graph = helper.make_graph( + [node1], + 'example_graph', + [input_tensor_A, input_tensor_B], + [output_tensor_C] + ) + onnx_model = helper.make_model(graph, producer_name='example_producer') + tmp_model_path = str(tmp_path / "model.onnx") + onnx.save(onnx_model, tmp_model_path) + input_model = ONNXModelHandler(model_path=tmp_model_path) + p = create_pass_from_dict(OnnxIOFloat16ToFloat32, None, disable_search=True) + output_folder = str(tmp_path / "onnx") + + # execute + output_model = p.run(input_model, None, output_folder) + + # assert + for input in output_model.get_graph().input: + assert input.type.tensor_type.elem_type == onnx.TensorProto.FLOAT + for output in output_model.get_graph().output: + assert output.type.tensor_type.elem_type == onnx.TensorProto.FLOAT \ No newline at end of file From 2607620630552158f1f0e33b94cbc562d05daa8b Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Mon, 13 May 2024 17:15:38 +0000 Subject: [PATCH 05/20] fix test --- .../passes/onnx/test_float32_conversion.py | 37 +++++++------------ 1 file changed, 13 
insertions(+), 24 deletions(-) diff --git a/test/unit_test/passes/onnx/test_float32_conversion.py b/test/unit_test/passes/onnx/test_float32_conversion.py index a0385aeee..31bb231de 100644 --- a/test/unit_test/passes/onnx/test_float32_conversion.py +++ b/test/unit_test/passes/onnx/test_float32_conversion.py @@ -2,35 +2,24 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +import onnx +from onnx import TensorProto, helper + from olive.model.handler.onnx import ONNXModelHandler from olive.passes.olive_pass import create_pass_from_dict from olive.passes.onnx.float32_conversion import OnnxIOFloat16ToFloat32 -from test.unit_test.utils import get_onnx_model -import onnx -from onnx import helper -from onnx import TensorProto def test_onnx_io_ft16_to_ft32_conversion(tmp_path): # setup - node1 = helper.make_node( - 'Add', - ['logits_A', 'logits_B'], - ['logits_C'], - name='add_node' - ) + node1 = helper.make_node("Add", ["logits_a", "logits_b"], ["logits_c"], name="add_node") - input_tensor_A = helper.make_tensor_value_info('logits_A', TensorProto.FLOAT16, [None]) - input_tensor_B = helper.make_tensor_value_info('logits_B', TensorProto.FLOAT16, [None]) - output_tensor_C = helper.make_tensor_value_info('logits_C', TensorProto.FLOAT16, [None]) + input_tensor_a = helper.make_tensor_value_info("logits_a", TensorProto.FLOAT16, [None]) + input_tensor_b = helper.make_tensor_value_info("logits_b", TensorProto.FLOAT16, [None]) + output_tensor_c = helper.make_tensor_value_info("logits_c", TensorProto.FLOAT16, [None]) - graph = helper.make_graph( - [node1], - 'example_graph', - [input_tensor_A, input_tensor_B], - [output_tensor_C] - ) - onnx_model = helper.make_model(graph, producer_name='example_producer') + graph = helper.make_graph([node1], "example_graph", [input_tensor_a, input_tensor_b], [output_tensor_c]) + onnx_model = helper.make_model(graph, producer_name="example_producer") tmp_model_path = str(tmp_path / "model.onnx") onnx.save(onnx_model, tmp_model_path) input_model = ONNXModelHandler(model_path=tmp_model_path) @@ -41,7 +30,7 @@ def test_onnx_io_ft16_to_ft32_conversion(tmp_path): output_model = p.run(input_model, None, output_folder) # assert - for input in output_model.get_graph().input: - assert input.type.tensor_type.elem_type == onnx.TensorProto.FLOAT - for output in output_model.get_graph().output: - assert output.type.tensor_type.elem_type == onnx.TensorProto.FLOAT \ No newline at end of file + for i in output_model.get_graph().input: + assert i.type.tensor_type.elem_type == onnx.TensorProto.FLOAT + for o in output_model.get_graph().output: + assert o.type.tensor_type.elem_type == onnx.TensorProto.FLOAT From 93bea01ab52b51569de7c5f476aee62b7dd56f90 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Mon, 13 May 2024 17:16:12 +0000 Subject: [PATCH 06/20] fix example --- examples/phi2/phi2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/phi2/phi2.py b/examples/phi2/phi2.py index a997df30a..628f1a5b8 100644 --- a/examples/phi2/phi2.py +++ b/examples/phi2/phi2.py @@ -142,8 +142,8 @@ def main(raw_args=None): template_json["passes"]["builder"]["config"]["precision"] = precision template_json["systems"]["local_system"]["config"]["accelerators"] = [ {"device": device, "execution_providers": [DEVICE_TO_EP[device.lower()]]} - ] - new_json_file = f"phi2_web.json" + ] + new_json_file = "phi2_web.json" with open(new_json_file, "w") as f: 
json.dump(template_json, f, indent=4) elif model_type == "web": @@ -154,9 +154,9 @@ def main(raw_args=None): template_json["systems"]["local_system"]["config"]["accelerators"] = [ {"device": "GPU", "execution_providers": ["JsExecutionProvider"]} ] - fl_type = { "type" : "OnnxIOFloat16ToFloat32"} + fl_type = {"type": "OnnxIOFloat16ToFloat32"} template_json["passes"]["fp32_logits"] = fl_type - new_json_file = f"phi2_web.json" + new_json_file = "phi2_web.json" with open(new_json_file, "w") as f: json.dump(template_json, f, indent=4) else: From 292ae27a1e3736df7958e7fdffb08f64ba463e8a Mon Sep 17 00:00:00 2001 From: Emma Date: Mon, 13 May 2024 17:32:51 -0700 Subject: [PATCH 07/20] Test for phi3 --- examples/phi3/README.md | 48 +++++++ examples/phi3/phi3.py | 212 +++++++++++++++++++++++++++++++ examples/phi3/phi3_template.json | 36 ++++++ examples/phi3/requirements.txt | 5 + 4 files changed, 301 insertions(+) create mode 100644 examples/phi3/README.md create mode 100644 examples/phi3/phi3.py create mode 100644 examples/phi3/phi3_template.json create mode 100644 examples/phi3/requirements.txt diff --git a/examples/phi3/README.md b/examples/phi3/README.md new file mode 100644 index 000000000..704dc8632 --- /dev/null +++ b/examples/phi3/README.md @@ -0,0 +1,48 @@ +# Phi3 optimization with Olive +This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https://ai.azure.com/explore/models/Phi-3-mini-4k-instruct/version/5/registry/azureml?tid=72f988bf-86f1-41af-91ab-2d7cd011db47) model in Azure ML Model Catalog for different hardware targets with Olive. + + +## Prerequisites +* einops +* Pytorch: >=2.2.0 \ + _The [official website](https://pytorch.org/) offers packages compatible with CUDA 11.8 and 12.1. Please select the appropriate version according to your needs._ +* [Package onnxruntime](https://onnxruntime.ai/docs/install/#inference-install-table-for-all-languages): >=1.18.0 +* [Package onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai): >=0.2.0. If you target GPU, pls install onnxruntime and onnxruntime-genai gpu packages. + +Install the dependencies +``` +pip install -r requirements.txt +``` + +## Usage +we will use the `phi3.py` script to generate optimized model for a chosen hardware target by running the following commands. + +``` +python phi3.py [--target HARDWARE_TARGET] [--precision DATA_TYPE] [--inference] [--prompt PROMPT] [--max_length LENGTH] + +# Examples +python phi3.py --target web + +python phi3.py --target mobile --inference --prompt "Write a story starting with once upon a time" --max_length 200 +``` + +- `--target`: cpu, cuda, mobile, web +- `--precision`: optional. fp32, fp16, int4. fp32 or int4(default) for cpu target; fp32 or fp16 or int4(default) for gpu target; int4(default) for mobile or web +- `--inference`: run the optimized model, for non-web models inference. +- `--prompt`: optional, the prompt text fed into the model. Take effect only when `--inference` is set. +- `--max_length`: optional, the max length of the output from the model. Take effect only when `--inference` is set. + + +This script includes +1. Generate the Olive configuration file for your need including the chosen HW target, the preferred model precision. +2. Generate optimized model with Olive based on the configuration file for the chosen HW target +3. (optional) Inference the optimized model with ONNX Runtime Generation API. 
Not supported for web target + + +If you have an Olive configuration file, you can also run the olive command for model generation: +``` +olive run [--config CONFIGURATION_FILE] + +# Examples +olive run --config phi3_mobile_int4.json +``` diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py new file mode 100644 index 000000000..6266276a8 --- /dev/null +++ b/examples/phi3/phi3.py @@ -0,0 +1,212 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import argparse +import json +import platform +import time +from pathlib import Path + +from onnxruntime import __version__ as OrtVersion +import onnxruntime_genai as og +from packaging import version + +from olive.workflows import run as olive_run + + +TARGETS = ["cpu", "gpu", "mobile", "web"] + +TARGET_TO_EP = { + "cpu": "CPUExecutionProvider", + "mobile": "CPUExecutionProvider", + "gpu": "CUDAExecutionProvider", + "web": "JsExecutionProvider", +} + + +def get_args(raw_args): + parser = argparse.ArgumentParser(description="phi3 optimization") + + parser.add_argument( + "--target", + type=str, + default=None, + choices=["cpu", "cuda", "mobile", "web"], + help="Choose from cpu, cuda, mobile or web", + ) + parser.add_argument( + "--precision", + type=str, + default=None, + choices=["fp32", "fp16", "int4"], + help="Choose from fp32 or int4(default) for cpu target; fp32 or fp16 or int4(default) for gpu target; int4(default) for mobile or web", + ) + parser.add_argument( + "--inference", + action="store_true", + help="Run inference with optimized model", + ) + parser.add_argument( + "--prompt", + nargs="*", + type=str, + default=["Write a joke"], + help="The prompt text fed into the model. Not supported with Web target.", + ) + parser.add_argument( + "--max_length", + type=int, + default=200, + help="Max length for generation. 
Not supported with Web target.", + ) + + return parser.parse_args(raw_args) + + +def main(raw_args=None): + args = get_args(raw_args) + if not args.target: + raise ValueError("Please specify target") + + if not args.precision: + args.precision = "int4" + elif(args.target == "mobile" or args.target == "web") and not(args.precision == "int4"): + raise ValueError("mobile or web only supports int4(default)") + elif args.target == "cpu" and args.precision == "fp16": + raise ValueError("Choose from fp32 or int4(default) for cpu target") + + if args.inference and args.target == "web": + raise ValueError("Web model inference is not supported in this script") + + #Generate Olive configuration file for specific target + print("\nGenerating Olive configuration file...") + config_file = generate_config(args) + print("Olive configuration file is generated...\n") + + # Generate optimized model for specific target + print("Generating optimized model for", args.target, " ...\n") + footprints = olive_run(config_file) + if footprints: + print("\nOptimized model is generated...") + + if args.inference: + prompts = "Write a joke" if not args.prompt else ''.join(args.prompt) + + chat_template = "<|user|>\n{input}<|end|>\n<|assistant|>" + prompts = f'{chat_template.format(input=prompts)}' + + + max_length = 200 if not args.max_length else args.max_length + + output_model_path = get_output_model_path(footprints) + genai_run(prompts, str(output_model_path), max_length) + + + +def generate_config(args): + + json_file_template = "phi3_template.json" + with open(json_file_template) as f: + template_json = json.load(f) + + target = str(args.target) + device = "GPU" if target == "cuda" or target == "web" else "CPU" + execution_providers = [TARGET_TO_EP[target.lower()]] + template_json["systems"]["local_system"]["config"]["accelerators"] = [ + {"device": device, "execution_providers": execution_providers} + ] + + model_builder = { + "type": "ModelBuilder", + "config": {"precision": args.precision,} + } + template_json["passes"]["builder"] = model_builder + + if target == "mobile": + template_json["passes"]["builder"]["config"]["int4_accuracy_level"] = 4 + + elif target == "web": + fl_type = {"type": "OnnxIOFloat16ToFloat32"} + template_json["passes"]["fp32_logits"] = fl_type + + new_json_file = f"phi3_{target.lower()}_{args.precision}.json" + with open(new_json_file, "w") as f: + json.dump(template_json, f, indent=4) + + return new_json_file + + +def get_output_model_path(footprints): + # only one model output in phi2 optimization + for footprint in footprints.values(): + for model_id in footprint.nodes: + model_path = Path(footprint.get_model_path(model_id)) + break + return model_path + + +def genai_run(prompt, model_path, max_length): + + print("\nModel inference starts...") + + print("Loading model...") + app_started_timestamp = time.time() + model = og.Model(model_path) + model_loaded_timestamp = time.time() + print("Model loaded in {:.2f} seconds".format(model_loaded_timestamp - app_started_timestamp)) + + print("Creating tokenizer...") + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + input_tokens = tokenizer.encode(prompt) + started_timestamp = time.time() + + print("Creating generator ...") + params = og.GeneratorParams(model) + #optimal search options for Phi3 + search_options = { + 'max_length': max_length, + 'top_k': 40, + 'top_p': 0.95, + 'temperature': 0.8, + 'repetition_penalty':1.0 + } + params.set_search_options(**search_options) + params.input_ids = input_tokens + 
generator = og.Generator(model, params) + print("Generator created") + + first = True + new_tokens = [] + + print("\n", prompt) + + try: + while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end="", flush=True) + new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + + del generator + + run_time = time.time() - started_timestamp + print("\n\n" + f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," + f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," + f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" + ) + + + +if __name__ == "__main__": + main() diff --git a/examples/phi3/phi3_template.json b/examples/phi3/phi3_template.json new file mode 100644 index 000000000..71411c11e --- /dev/null +++ b/examples/phi3/phi3_template.json @@ -0,0 +1,36 @@ +{ + "input_model":{ + "type": "PyTorchModel", + "config": { + "hf_config": { + "model_name": "microsoft/Phi-3-mini-4k-instruct", + "task": "text-generation", + "from_pretrained_args": { + "trust_remote_code": true + } + } + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { + "accelerators": [ + { + "device": "CPU", + "execution_providers": [ + "CPUExecutionProvider" + ] + } + ] + } + } + }, + "passes": { + + }, + "engine": { + "cache_dir": "cache", + "output_dir": "Opt_model" + } +} diff --git a/examples/phi3/requirements.txt b/examples/phi3/requirements.txt new file mode 100644 index 000000000..3f5cc1ebf --- /dev/null +++ b/examples/phi3/requirements.txt @@ -0,0 +1,5 @@ +einops +onnx>=1.15.0 +onnxscript>=0.1.0.dev20240126 +torch>=2.2.0 +transformers>=4.36.2 From 9d01919a90411d2ebb553e1ec20efdaefdc1150e Mon Sep 17 00:00:00 2001 From: Emma Date: Mon, 13 May 2024 17:38:08 -0700 Subject: [PATCH 08/20] Update readme --- examples/phi3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/phi3/README.md b/examples/phi3/README.md index 704dc8632..6544b4de6 100644 --- a/examples/phi3/README.md +++ b/examples/phi3/README.md @@ -1,5 +1,5 @@ # Phi3 optimization with Olive -This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https://ai.azure.com/explore/models/Phi-3-mini-4k-instruct/version/5/registry/azureml?tid=72f988bf-86f1-41af-91ab-2d7cd011db47) model in Azure ML Model Catalog for different hardware targets with Olive. +This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model in HF for different hardware targets with Olive. 
## Prerequisites From cff45a69ffeb6cde8b57c27094fe197b21fa5b67 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 01:30:59 +0000 Subject: [PATCH 09/20] fix lint warnings --- docs/source/features/passes/onnx.md | 2 +- examples/phi3/README.md | 10 ++--- examples/phi3/phi3.py | 56 ++++++++++++------------- examples/phi3/phi3_template.json | 2 +- olive/passes/onnx/float32_conversion.py | 2 +- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/docs/source/features/passes/onnx.md b/docs/source/features/passes/onnx.md index a22bc30a4..60c3b6730 100644 --- a/docs/source/features/passes/onnx.md +++ b/docs/source/features/passes/onnx.md @@ -430,7 +430,7 @@ See [Float16 Conversion](https://onnxruntime.ai/docs/performance/model-optimizat ## Inputs/Outputs Float16 to Float32 Conversion -Certain environments such as Onnxruntime WebGPU prefers Float32 logits. The `OnnxIOFloat16ToFloat32` pass converts the inputs and outputs to use Float16 instead of Float32. +Certain environments such as Onnxruntime WebGPU prefers Float32 logits. The `OnnxIOFloat16ToFloat32` pass converts the inputs and outputs to use Float32 instead of Float16. ### Example Configuration diff --git a/examples/phi3/README.md b/examples/phi3/README.md index 6544b4de6..291a5c23e 100644 --- a/examples/phi3/README.md +++ b/examples/phi3/README.md @@ -1,5 +1,5 @@ # Phi3 optimization with Olive -This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model in HF for different hardware targets with Olive. +This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model in HF for different hardware targets with Olive. ## Prerequisites @@ -7,7 +7,7 @@ This folder contains an example of optimizing [the Phi-3-Mini-4K-Instruct](https * Pytorch: >=2.2.0 \ _The [official website](https://pytorch.org/) offers packages compatible with CUDA 11.8 and 12.1. Please select the appropriate version according to your needs._ * [Package onnxruntime](https://onnxruntime.ai/docs/install/#inference-install-table-for-all-languages): >=1.18.0 -* [Package onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai): >=0.2.0. If you target GPU, pls install onnxruntime and onnxruntime-genai gpu packages. +* [Package onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai): >=0.2.0. If you target GPU, pls install onnxruntime and onnxruntime-genai gpu packages. Install the dependencies ``` @@ -33,10 +33,10 @@ python phi3.py --target mobile --inference --prompt "Write a story starting with - `--max_length`: optional, the max length of the output from the model. Take effect only when `--inference` is set. -This script includes -1. Generate the Olive configuration file for your need including the chosen HW target, the preferred model precision. +This script includes +1. Generate the Olive configuration file for your need including the chosen HW target, the preferred model precision. 2. Generate optimized model with Olive based on the configuration file for the chosen HW target -3. (optional) Inference the optimized model with ONNX Runtime Generation API. Not supported for web target +3. (optional) Inference the optimized model with ONNX Runtime Generation API. 
Not supported for web target If you have an Olive configuration file, you can also run the olive command for model generation: diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index 6266276a8..e81831b8a 100644 --- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -5,16 +5,15 @@ import argparse import json -import platform import time from pathlib import Path -from onnxruntime import __version__ as OrtVersion import onnxruntime_genai as og -from packaging import version from olive.workflows import run as olive_run +# flake8: noqa: T201 + TARGETS = ["cpu", "gpu", "mobile", "web"] @@ -41,7 +40,8 @@ def get_args(raw_args): type=str, default=None, choices=["fp32", "fp16", "int4"], - help="Choose from fp32 or int4(default) for cpu target; fp32 or fp16 or int4(default) for gpu target; int4(default) for mobile or web", + help="Choose from fp32 or int4(default) for cpu target; " + "fp32 or fp16 or int4(default) for gpu target; int4(default) for mobile or web", ) parser.add_argument( "--inference", @@ -61,7 +61,7 @@ def get_args(raw_args): default=200, help="Max length for generation. Not supported with Web target.", ) - + return parser.parse_args(raw_args) @@ -72,7 +72,7 @@ def main(raw_args=None): if not args.precision: args.precision = "int4" - elif(args.target == "mobile" or args.target == "web") and not(args.precision == "int4"): + elif args.target in ("mobile", "web") and args.precision != "int4": raise ValueError("mobile or web only supports int4(default)") elif args.target == "cpu" and args.precision == "fp16": raise ValueError("Choose from fp32 or int4(default) for cpu target") @@ -80,23 +80,22 @@ def main(raw_args=None): if args.inference and args.target == "web": raise ValueError("Web model inference is not supported in this script") - #Generate Olive configuration file for specific target + # Generate Olive configuration file for specific target print("\nGenerating Olive configuration file...") config_file = generate_config(args) print("Olive configuration file is generated...\n") - # Generate optimized model for specific target + # Generate optimized model for specific target print("Generating optimized model for", args.target, " ...\n") footprints = olive_run(config_file) if footprints: print("\nOptimized model is generated...") if args.inference: - prompts = "Write a joke" if not args.prompt else ''.join(args.prompt) + prompts = "Write a joke" if not args.prompt else "".join(args.prompt) chat_template = "<|user|>\n{input}<|end|>\n<|assistant|>" - prompts = f'{chat_template.format(input=prompts)}' - + prompts = f"{chat_template.format(input=prompts)}" max_length = 200 if not args.max_length else args.max_length @@ -104,15 +103,14 @@ def main(raw_args=None): genai_run(prompts, str(output_model_path), max_length) - def generate_config(args): json_file_template = "phi3_template.json" with open(json_file_template) as f: - template_json = json.load(f) + template_json = json.load(f) target = str(args.target) - device = "GPU" if target == "cuda" or target == "web" else "CPU" + device = "GPU" if target in ("cuda", "web") else "CPU" execution_providers = [TARGET_TO_EP[target.lower()]] template_json["systems"]["local_system"]["config"]["accelerators"] = [ {"device": device, "execution_providers": execution_providers} @@ -120,8 +118,10 @@ def generate_config(args): model_builder = { "type": "ModelBuilder", - "config": {"precision": args.precision,} - } + "config": { + "precision": args.precision, + }, + } template_json["passes"]["builder"] = model_builder if target == "mobile": @@ 
-150,7 +150,7 @@ def get_output_model_path(footprints): def genai_run(prompt, model_path, max_length): print("\nModel inference starts...") - + print("Loading model...") app_started_timestamp = time.time() model = og.Model(model_path) @@ -165,14 +165,14 @@ def genai_run(prompt, model_path, max_length): print("Creating generator ...") params = og.GeneratorParams(model) - #optimal search options for Phi3 + # optimal search options for Phi3 search_options = { - 'max_length': max_length, - 'top_k': 40, - 'top_p': 0.95, - 'temperature': 0.8, - 'repetition_penalty':1.0 - } + "max_length": max_length, + "top_k": 40, + "top_p": 0.95, + "temperature": 0.8, + "repetition_penalty": 1.0, + } params.set_search_options(**search_options) params.input_ids = input_tokens generator = og.Generator(model, params) @@ -195,18 +195,18 @@ def genai_run(prompt, model_path, max_length): print(tokenizer_stream.decode(new_token), end="", flush=True) new_tokens.append(new_token) except KeyboardInterrupt: - print(" --control+c pressed, aborting generation--") - + print(" --control+c pressed, aborting generation--") + del generator run_time = time.time() - started_timestamp - print("\n\n" + print( + "\n\n" f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" ) - if __name__ == "__main__": main() diff --git a/examples/phi3/phi3_template.json b/examples/phi3/phi3_template.json index 71411c11e..a976ff2fe 100644 --- a/examples/phi3/phi3_template.json +++ b/examples/phi3/phi3_template.json @@ -27,7 +27,7 @@ } }, "passes": { - + }, "engine": { "cache_dir": "cache", diff --git a/olive/passes/onnx/float32_conversion.py b/olive/passes/onnx/float32_conversion.py index 211a5ce12..9e375df37 100644 --- a/olive/passes/onnx/float32_conversion.py +++ b/olive/passes/onnx/float32_conversion.py @@ -66,7 +66,7 @@ def wrap_inputs(self, graph, i_map, names): "Cast", inputs=[i.name], outputs=[i.name + "_fp16"], - to=onnx.TensorProto.FLOAT, + to=onnx.TensorProto.FLOAT16, ) graph.node.insert(0, cast) i.type.tensor_type.elem_type = onnx.TensorProto.FLOAT From 029fbb96e7ab9692e5b812b4c8ffbb8d370c1b1d Mon Sep 17 00:00:00 2001 From: Devang Patel <47577486+devang-ml@users.noreply.github.com> Date: Tue, 14 May 2024 08:49:01 -0700 Subject: [PATCH 10/20] Fix doc build --- docs/source/api/passes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/passes.rst b/docs/source/api/passes.rst index faf6e577e..1cc19b155 100644 --- a/docs/source/api/passes.rst +++ b/docs/source/api/passes.rst @@ -45,7 +45,7 @@ OnnxFloatToFloat16 .. _onnx_io_float16_to_float32: OnnxIOFloat16ToFloat32 --------------------- +---------------------- .. autoconfigclass:: olive.passes.OnnxIOFloat16ToFloat32 .. 
_ort_mixed_precision: From 158469316655148d3720d316bfff8f5c5af76eef Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 16:57:24 +0000 Subject: [PATCH 11/20] fix format --- examples/phi3/phi3.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index e81831b8a..ca414fe26 100644 --- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -179,6 +179,7 @@ def genai_run(prompt, model_path, max_length): print("Generator created") first = True + first_token_timestamp = None new_tokens = [] print("\n", prompt) @@ -200,12 +201,15 @@ def genai_run(prompt, model_path, max_length): del generator run_time = time.time() - started_timestamp - print( - "\n\n" - f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," - f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," - f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" - ) + if first_token_timestamp is None: + print("\n\nNo tokens generated") + else: + print( + "\n\n" + f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," + f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," + f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" + ) if __name__ == "__main__": From fc722c7b6bfedec6525fd5d0c04909f79c268ee6 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 17:15:41 +0000 Subject: [PATCH 12/20] fix format --- examples/directml/llm/run_llm_batched_io_binding.py | 2 ++ examples/directml/llm/run_llm_io_binding.py | 2 ++ examples/directml/llm/run_vision_llm_io_binding.py | 2 ++ olive/passes/onnx/vitis_ai_quantization.py | 1 + olive/passes/openvino/quantization.py | 1 + olive/strategy/search_space.py | 1 + test/unit_test/engine/packaging/test_packaging_generator.py | 1 + 7 files changed, 10 insertions(+) diff --git a/examples/directml/llm/run_llm_batched_io_binding.py b/examples/directml/llm/run_llm_batched_io_binding.py index d46524c21..672e62bd0 100644 --- a/examples/directml/llm/run_llm_batched_io_binding.py +++ b/examples/directml/llm/run_llm_batched_io_binding.py @@ -59,6 +59,8 @@ def run_llm_io_binding( data_type = np.float16 num_layers = 0 + num_key_value_heads = None + head_dim = None for inputs_meta in llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): num_layers += 1 diff --git a/examples/directml/llm/run_llm_io_binding.py b/examples/directml/llm/run_llm_io_binding.py index 5fe60ef87..1dba442ce 100644 --- a/examples/directml/llm/run_llm_io_binding.py +++ b/examples/directml/llm/run_llm_io_binding.py @@ -62,6 +62,8 @@ def run_llm_io_binding( data_type = np.float16 num_layers = 0 + num_key_value_heads = None + head_dim = None for inputs_meta in llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): num_layers += 1 diff --git a/examples/directml/llm/run_vision_llm_io_binding.py b/examples/directml/llm/run_vision_llm_io_binding.py index 8f786fbed..bdfe8d95b 100644 --- a/examples/directml/llm/run_vision_llm_io_binding.py +++ b/examples/directml/llm/run_vision_llm_io_binding.py @@ -54,6 +54,8 @@ def run_vision_llm_io_binding( data_type = np.float16 num_layers = 0 + num_key_value_heads = None + head_dim = None for inputs_meta in llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): 
num_layers += 1 diff --git a/olive/passes/onnx/vitis_ai_quantization.py b/olive/passes/onnx/vitis_ai_quantization.py index 955d9edfc..d4f795394 100644 --- a/olive/passes/onnx/vitis_ai_quantization.py +++ b/olive/passes/onnx/vitis_ai_quantization.py @@ -339,6 +339,7 @@ def _run_for_config( # get the dataloader # TODO(XiaoSheng): only use data config + dataloader = None if config["dataloader_func"]: data_dir = get_local_path_from_root(data_root, config["data_dir"]) dataloader = self._user_module_loader.call_object( diff --git a/olive/passes/openvino/quantization.py b/olive/passes/openvino/quantization.py index 0738d7bfd..4b8d73bd3 100644 --- a/olive/passes/openvino/quantization.py +++ b/olive/passes/openvino/quantization.py @@ -175,6 +175,7 @@ def _get_nncf_dataset(self, config, data_root): except ImportError: raise ImportError("Please install olive-ai[openvino] to use OpenVINO pass") from None + data_loader = None if config["dataloader_func"]: data_dir = get_local_path_from_root(data_root, config["data_dir"]) data_loader = self._user_module_loader.call_object( diff --git a/olive/strategy/search_space.py b/olive/strategy/search_space.py index 11af8bcf4..35c93ef02 100644 --- a/olive/strategy/search_space.py +++ b/olive/strategy/search_space.py @@ -44,6 +44,7 @@ def random_sample(self) -> Dict[str, Dict[str, Any]]: search_point = deepcopy(self._empty_search_point) # sample from search space + options = None for space_name, param_name in self._iter_order: param = self._search_space[space_name][param_name] if isinstance(param, Conditional): diff --git a/test/unit_test/engine/packaging/test_packaging_generator.py b/test/unit_test/engine/packaging/test_packaging_generator.py index f9d69c3ae..4e59bb816 100644 --- a/test/unit_test/engine/packaging/test_packaging_generator.py +++ b/test/unit_test/engine/packaging/test_packaging_generator.py @@ -384,6 +384,7 @@ def test_azureml_deployment(mock_retry_func, inferencing_server_type): model_package_mock = Mock() inferencing_server = None + deployment = None if inferencing_server_type == InferencingServerType.AzureMLOnline: inferencing_server = AzureMLOnlineInferencingServer(code_configuration=code_configuration) deployment = ManagedOnlineDeployment( From bc81a2c4d3d7a1ad710ded9dea890a66a4831b5a Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 18:25:28 +0000 Subject: [PATCH 13/20] fix foramt --- examples/gptj/user_script.py | 24 ++++------- examples/open_llama/user_script.py | 68 ++++++++++++++---------------- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/examples/gptj/user_script.py b/examples/gptj/user_script.py index a33740b13..e08ead38f 100644 --- a/examples/gptj/user_script.py +++ b/examples/gptj/user_script.py @@ -55,11 +55,8 @@ def collate_batch(self, batch): return (torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded)), torch.tensor(last_ind) def __iter__(self): - try: - for (input_ids, _attention_mask), last_ind in self.dataloader: - yield input_ids, last_ind - except StopIteration: - return + for (input_ids, _attention_mask), last_ind in self.dataloader: + yield input_ids, last_ind def __len__(self): return len(self.dataloader) @@ -70,16 +67,13 @@ def __init__(self, pad_max=196, batch_size=1): super().__init__(pad_max, batch_size) def __iter__(self): - try: - for (input_ids, attention_mask), last_ind in self.dataloader: - data = [input_ids.detach().cpu().numpy().astype("int64")] - for _ in range(28): - data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) - 
data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) - data.append(attention_mask.detach().cpu().numpy().astype("int64")) - yield data, last_ind.detach().cpu().numpy() - except StopIteration: - return + for (input_ids, attention_mask), last_ind in self.dataloader: + data = [input_ids.detach().cpu().numpy().astype("int64")] + for _ in range(28): + data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) + data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) + data.append(attention_mask.detach().cpu().numpy().astype("int64")) + yield data, last_ind.detach().cpu().numpy() def create_pt_dataloader(data_dir, batch_size, *args, **kwargs): diff --git a/examples/open_llama/user_script.py b/examples/open_llama/user_script.py index e1e0eab8c..9a830dcb9 100644 --- a/examples/open_llama/user_script.py +++ b/examples/open_llama/user_script.py @@ -87,43 +87,39 @@ def __init__(self, model_path, batch_size=1, seqlen=2048, sub_folder="train"): self.sess = ort.InferenceSession(decoder_model_path, providers=["CPUExecutionProvider"]) def __iter__(self): - try: + while True: while True: - while True: - i = random.randint(0, len(self.dataset) - 1) - trainenc = self.dataset[i] - if trainenc["input_ids"].shape[0] > self.seqlen: - break - i = random.randint(0, trainenc["input_ids"].shape[0] - self.seqlen - 1) - j = i + self.seqlen - inp = trainenc["input_ids"][i:j].unsqueeze(0) - mask = torch.ones(inp.shape) - if self.sess is None: - yield { - "input_ids": inp.detach().cpu().numpy().astype("int64"), - "attention_mask": mask.detach().cpu().numpy().astype("int64"), - }, 0 - else: - outputs = self.sess.run( - None, - { - "input_ids": inp[:, :-1].detach().cpu().numpy().astype("int64"), - "attention_mask": mask[:, :-1].detach().cpu().numpy().astype("int64"), - }, - ) - ort_input = {} - ort_input["input_ids"] = inp[:, -1].unsqueeze(0).detach().cpu().numpy().astype("int64") - for layer_index in range(config.num_hidden_layers): - ort_input[f"past_key_values.{layer_index}.key"] = outputs[layer_index * 2 + 1] - ort_input[f"past_key_values.{layer_index}.value"] = outputs[layer_index * 2 + 2] - - ort_input["attention_mask"] = np.zeros( - [self.batch_size, ort_input["past_key_values.0.key"].shape[2] + 1], dtype="int64" - ) - yield ort_input, 0 - - except StopIteration: - return + i = random.randint(0, len(self.dataset) - 1) + trainenc = self.dataset[i] + if trainenc["input_ids"].shape[0] > self.seqlen: + break + i = random.randint(0, trainenc["input_ids"].shape[0] - self.seqlen - 1) + j = i + self.seqlen + inp = trainenc["input_ids"][i:j].unsqueeze(0) + mask = torch.ones(inp.shape) + if self.sess is None: + yield { + "input_ids": inp.detach().cpu().numpy().astype("int64"), + "attention_mask": mask.detach().cpu().numpy().astype("int64"), + }, 0 + else: + outputs = self.sess.run( + None, + { + "input_ids": inp[:, :-1].detach().cpu().numpy().astype("int64"), + "attention_mask": mask[:, :-1].detach().cpu().numpy().astype("int64"), + }, + ) + ort_input = {} + ort_input["input_ids"] = inp[:, -1].unsqueeze(0).detach().cpu().numpy().astype("int64") + for layer_index in range(config.num_hidden_layers): + ort_input[f"past_key_values.{layer_index}.key"] = outputs[layer_index * 2 + 1] + ort_input[f"past_key_values.{layer_index}.value"] = outputs[layer_index * 2 + 2] + + ort_input["attention_mask"] = np.zeros( + [self.batch_size, ort_input["past_key_values.0.key"].shape[2] + 1], dtype="int64" + ) + yield ort_input, 0 def calib_dataloader(data_dir, batch_size, *args, **kwargs): From 
7b7050914cd3aa27f0db9ffe0563a6dda478ac64 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:12:10 +0000 Subject: [PATCH 14/20] Revert "fix foramt" This reverts commit bc81a2c4d3d7a1ad710ded9dea890a66a4831b5a. --- examples/gptj/user_script.py | 24 +++++++---- examples/open_llama/user_script.py | 68 ++++++++++++++++-------------- 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/examples/gptj/user_script.py b/examples/gptj/user_script.py index e08ead38f..a33740b13 100644 --- a/examples/gptj/user_script.py +++ b/examples/gptj/user_script.py @@ -55,8 +55,11 @@ def collate_batch(self, batch): return (torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded)), torch.tensor(last_ind) def __iter__(self): - for (input_ids, _attention_mask), last_ind in self.dataloader: - yield input_ids, last_ind + try: + for (input_ids, _attention_mask), last_ind in self.dataloader: + yield input_ids, last_ind + except StopIteration: + return def __len__(self): return len(self.dataloader) @@ -67,13 +70,16 @@ def __init__(self, pad_max=196, batch_size=1): super().__init__(pad_max, batch_size) def __iter__(self): - for (input_ids, attention_mask), last_ind in self.dataloader: - data = [input_ids.detach().cpu().numpy().astype("int64")] - for _ in range(28): - data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) - data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) - data.append(attention_mask.detach().cpu().numpy().astype("int64")) - yield data, last_ind.detach().cpu().numpy() + try: + for (input_ids, attention_mask), last_ind in self.dataloader: + data = [input_ids.detach().cpu().numpy().astype("int64")] + for _ in range(28): + data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) + data.append(np.zeros((input_ids.shape[0], 16, 1, 256), dtype="float32")) + data.append(attention_mask.detach().cpu().numpy().astype("int64")) + yield data, last_ind.detach().cpu().numpy() + except StopIteration: + return def create_pt_dataloader(data_dir, batch_size, *args, **kwargs): diff --git a/examples/open_llama/user_script.py b/examples/open_llama/user_script.py index 9a830dcb9..e1e0eab8c 100644 --- a/examples/open_llama/user_script.py +++ b/examples/open_llama/user_script.py @@ -87,39 +87,43 @@ def __init__(self, model_path, batch_size=1, seqlen=2048, sub_folder="train"): self.sess = ort.InferenceSession(decoder_model_path, providers=["CPUExecutionProvider"]) def __iter__(self): - while True: + try: while True: - i = random.randint(0, len(self.dataset) - 1) - trainenc = self.dataset[i] - if trainenc["input_ids"].shape[0] > self.seqlen: - break - i = random.randint(0, trainenc["input_ids"].shape[0] - self.seqlen - 1) - j = i + self.seqlen - inp = trainenc["input_ids"][i:j].unsqueeze(0) - mask = torch.ones(inp.shape) - if self.sess is None: - yield { - "input_ids": inp.detach().cpu().numpy().astype("int64"), - "attention_mask": mask.detach().cpu().numpy().astype("int64"), - }, 0 - else: - outputs = self.sess.run( - None, - { - "input_ids": inp[:, :-1].detach().cpu().numpy().astype("int64"), - "attention_mask": mask[:, :-1].detach().cpu().numpy().astype("int64"), - }, - ) - ort_input = {} - ort_input["input_ids"] = inp[:, -1].unsqueeze(0).detach().cpu().numpy().astype("int64") - for layer_index in range(config.num_hidden_layers): - ort_input[f"past_key_values.{layer_index}.key"] = outputs[layer_index * 2 + 1] - ort_input[f"past_key_values.{layer_index}.value"] = outputs[layer_index * 2 + 2] - - ort_input["attention_mask"] = np.zeros( - 
[self.batch_size, ort_input["past_key_values.0.key"].shape[2] + 1], dtype="int64" - ) - yield ort_input, 0 + while True: + i = random.randint(0, len(self.dataset) - 1) + trainenc = self.dataset[i] + if trainenc["input_ids"].shape[0] > self.seqlen: + break + i = random.randint(0, trainenc["input_ids"].shape[0] - self.seqlen - 1) + j = i + self.seqlen + inp = trainenc["input_ids"][i:j].unsqueeze(0) + mask = torch.ones(inp.shape) + if self.sess is None: + yield { + "input_ids": inp.detach().cpu().numpy().astype("int64"), + "attention_mask": mask.detach().cpu().numpy().astype("int64"), + }, 0 + else: + outputs = self.sess.run( + None, + { + "input_ids": inp[:, :-1].detach().cpu().numpy().astype("int64"), + "attention_mask": mask[:, :-1].detach().cpu().numpy().astype("int64"), + }, + ) + ort_input = {} + ort_input["input_ids"] = inp[:, -1].unsqueeze(0).detach().cpu().numpy().astype("int64") + for layer_index in range(config.num_hidden_layers): + ort_input[f"past_key_values.{layer_index}.key"] = outputs[layer_index * 2 + 1] + ort_input[f"past_key_values.{layer_index}.value"] = outputs[layer_index * 2 + 2] + + ort_input["attention_mask"] = np.zeros( + [self.batch_size, ort_input["past_key_values.0.key"].shape[2] + 1], dtype="int64" + ) + yield ort_input, 0 + + except StopIteration: + return def calib_dataloader(data_dir, batch_size, *args, **kwargs): From ff677b61320f2545f8354c179b2b4116434638a2 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:12:31 +0000 Subject: [PATCH 15/20] Revert "fix format" This reverts commit fc722c7b6bfedec6525fd5d0c04909f79c268ee6. --- examples/directml/llm/run_llm_batched_io_binding.py | 2 -- examples/directml/llm/run_llm_io_binding.py | 2 -- examples/directml/llm/run_vision_llm_io_binding.py | 2 -- olive/passes/onnx/vitis_ai_quantization.py | 1 - olive/passes/openvino/quantization.py | 1 - olive/strategy/search_space.py | 1 - test/unit_test/engine/packaging/test_packaging_generator.py | 1 - 7 files changed, 10 deletions(-) diff --git a/examples/directml/llm/run_llm_batched_io_binding.py b/examples/directml/llm/run_llm_batched_io_binding.py index 672e62bd0..d46524c21 100644 --- a/examples/directml/llm/run_llm_batched_io_binding.py +++ b/examples/directml/llm/run_llm_batched_io_binding.py @@ -59,8 +59,6 @@ def run_llm_io_binding( data_type = np.float16 num_layers = 0 - num_key_value_heads = None - head_dim = None for inputs_meta in llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): num_layers += 1 diff --git a/examples/directml/llm/run_llm_io_binding.py b/examples/directml/llm/run_llm_io_binding.py index 1dba442ce..5fe60ef87 100644 --- a/examples/directml/llm/run_llm_io_binding.py +++ b/examples/directml/llm/run_llm_io_binding.py @@ -62,8 +62,6 @@ def run_llm_io_binding( data_type = np.float16 num_layers = 0 - num_key_value_heads = None - head_dim = None for inputs_meta in llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): num_layers += 1 diff --git a/examples/directml/llm/run_vision_llm_io_binding.py b/examples/directml/llm/run_vision_llm_io_binding.py index bdfe8d95b..8f786fbed 100644 --- a/examples/directml/llm/run_vision_llm_io_binding.py +++ b/examples/directml/llm/run_vision_llm_io_binding.py @@ -54,8 +54,6 @@ def run_vision_llm_io_binding( data_type = np.float16 num_layers = 0 - num_key_value_heads = None - head_dim = None for inputs_meta in 
llm_session._inputs_meta: # pylint: disable=protected-access if inputs_meta.name.startswith("past_key_values.") and inputs_meta.name.endswith(".key"): num_layers += 1 diff --git a/olive/passes/onnx/vitis_ai_quantization.py b/olive/passes/onnx/vitis_ai_quantization.py index d4f795394..955d9edfc 100644 --- a/olive/passes/onnx/vitis_ai_quantization.py +++ b/olive/passes/onnx/vitis_ai_quantization.py @@ -339,7 +339,6 @@ def _run_for_config( # get the dataloader # TODO(XiaoSheng): only use data config - dataloader = None if config["dataloader_func"]: data_dir = get_local_path_from_root(data_root, config["data_dir"]) dataloader = self._user_module_loader.call_object( diff --git a/olive/passes/openvino/quantization.py b/olive/passes/openvino/quantization.py index 4b8d73bd3..0738d7bfd 100644 --- a/olive/passes/openvino/quantization.py +++ b/olive/passes/openvino/quantization.py @@ -175,7 +175,6 @@ def _get_nncf_dataset(self, config, data_root): except ImportError: raise ImportError("Please install olive-ai[openvino] to use OpenVINO pass") from None - data_loader = None if config["dataloader_func"]: data_dir = get_local_path_from_root(data_root, config["data_dir"]) data_loader = self._user_module_loader.call_object( diff --git a/olive/strategy/search_space.py b/olive/strategy/search_space.py index 35c93ef02..11af8bcf4 100644 --- a/olive/strategy/search_space.py +++ b/olive/strategy/search_space.py @@ -44,7 +44,6 @@ def random_sample(self) -> Dict[str, Dict[str, Any]]: search_point = deepcopy(self._empty_search_point) # sample from search space - options = None for space_name, param_name in self._iter_order: param = self._search_space[space_name][param_name] if isinstance(param, Conditional): diff --git a/test/unit_test/engine/packaging/test_packaging_generator.py b/test/unit_test/engine/packaging/test_packaging_generator.py index 4e59bb816..f9d69c3ae 100644 --- a/test/unit_test/engine/packaging/test_packaging_generator.py +++ b/test/unit_test/engine/packaging/test_packaging_generator.py @@ -384,7 +384,6 @@ def test_azureml_deployment(mock_retry_func, inferencing_server_type): model_package_mock = Mock() inferencing_server = None - deployment = None if inferencing_server_type == InferencingServerType.AzureMLOnline: inferencing_server = AzureMLOnlineInferencingServer(code_configuration=code_configuration) deployment = ManagedOnlineDeployment( From a6541785e14d014706738faf1825471c5532acef Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:12:47 +0000 Subject: [PATCH 16/20] Revert "fix format" This reverts commit 158469316655148d3720d316bfff8f5c5af76eef. 
--- examples/phi3/phi3.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index ca414fe26..e81831b8a 100644 --- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -179,7 +179,6 @@ def genai_run(prompt, model_path, max_length): print("Generator created") first = True - first_token_timestamp = None new_tokens = [] print("\n", prompt) @@ -201,15 +200,12 @@ def genai_run(prompt, model_path, max_length): del generator run_time = time.time() - started_timestamp - if first_token_timestamp is None: - print("\n\nNo tokens generated") - else: - print( - "\n\n" - f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," - f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," - f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" - ) + print( + "\n\n" + f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," + f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," + f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" + ) if __name__ == "__main__": From ed12173fd4377d631124f3d7a8118384e8e62c05 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:19:12 +0000 Subject: [PATCH 17/20] Revert "Revert "fix format"" This reverts commit a6541785e14d014706738faf1825471c5532acef. --- examples/phi3/phi3.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index e81831b8a..ca414fe26 100644 --- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -179,6 +179,7 @@ def genai_run(prompt, model_path, max_length): print("Generator created") first = True + first_token_timestamp = None new_tokens = [] print("\n", prompt) @@ -200,12 +201,15 @@ def genai_run(prompt, model_path, max_length): del generator run_time = time.time() - started_timestamp - print( - "\n\n" - f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," - f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," - f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" - ) + if first_token_timestamp is None: + print("\n\nNo tokens generated") + else: + print( + "\n\n" + f"Prompt tokens: {len(input_tokens)}, New tokens: {len(new_tokens)}," + f" Time to first: {(first_token_timestamp - started_timestamp):.2f}s," + f" New tokens per second: {len(new_tokens)/run_time:.2f} tps" + ) if __name__ == "__main__": From 02429be37cea3a2abc9fff5a7432c1d6f4cdf57f Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:46:49 +0000 Subject: [PATCH 18/20] fix nit --- olive/passes/onnx/float32_conversion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/olive/passes/onnx/float32_conversion.py b/olive/passes/onnx/float32_conversion.py index 9e375df37..c3386bdd6 100644 --- a/olive/passes/onnx/float32_conversion.py +++ b/olive/passes/onnx/float32_conversion.py @@ -43,7 +43,7 @@ def create_io_mapping(self, graph, i_map, o_map): i_map[i].append(n) for n in graph.node: for o in n.output: - assert o not in o_map + assert o not in o_map[o] o_map[o] = [n] def wrap_inputs(self, graph, i_map, names): From 0e57a79f3dfd207d6f2e7fad83254ccbae941fe0 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 20:59:18 +0000 Subject: [PATCH 19/20] fix nit --- examples/phi3/phi3.py | 12 ++++-------- examples/phi3/phi3_template.json | 4 +++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index ca414fe26..7cfaa2eb0 100644 
--- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -15,7 +15,7 @@ # flake8: noqa: T201 -TARGETS = ["cpu", "gpu", "mobile", "web"] +TARGETS = ["cpu", "cuda", "mobile", "web"] TARGET_TO_EP = { "cpu": "CPUExecutionProvider", @@ -32,13 +32,14 @@ def get_args(raw_args): "--target", type=str, default=None, - choices=["cpu", "cuda", "mobile", "web"], + required=True, + choices=TARGETS, help="Choose from cpu, cuda, mobile or web", ) parser.add_argument( "--precision", type=str, - default=None, + default="int4", choices=["fp32", "fp16", "int4"], help="Choose from fp32 or int4(default) for cpu target; " "fp32 or fp16 or int4(default) for gpu target; int4(default) for mobile or web", @@ -67,11 +68,6 @@ def get_args(raw_args): def main(raw_args=None): args = get_args(raw_args) - if not args.target: - raise ValueError("Please specify target") - - if not args.precision: - args.precision = "int4" elif args.target in ("mobile", "web") and args.precision != "int4": raise ValueError("mobile or web only supports int4(default)") elif args.target == "cpu" and args.precision == "fp16": diff --git a/examples/phi3/phi3_template.json b/examples/phi3/phi3_template.json index a976ff2fe..3c11ac05d 100644 --- a/examples/phi3/phi3_template.json +++ b/examples/phi3/phi3_template.json @@ -31,6 +31,8 @@ }, "engine": { "cache_dir": "cache", - "output_dir": "Opt_model" + "output_dir": "Opt_model", + "host": "local_system", + "target": "local_system" } } From ab94f2480cae18df07168666baba36e972a72af0 Mon Sep 17 00:00:00 2001 From: Xiaoyu Date: Tue, 14 May 2024 22:12:36 +0000 Subject: [PATCH 20/20] fix nit --- examples/phi3/phi3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/phi3/phi3.py b/examples/phi3/phi3.py index 7cfaa2eb0..f01ca1313 100644 --- a/examples/phi3/phi3.py +++ b/examples/phi3/phi3.py @@ -20,7 +20,7 @@ TARGET_TO_EP = { "cpu": "CPUExecutionProvider", "mobile": "CPUExecutionProvider", - "gpu": "CUDAExecutionProvider", + "cuda": "CUDAExecutionProvider", "web": "JsExecutionProvider", } @@ -68,7 +68,7 @@ def get_args(raw_args): def main(raw_args=None): args = get_args(raw_args) - elif args.target in ("mobile", "web") and args.precision != "int4": + if args.target in ("mobile", "web") and args.precision != "int4": raise ValueError("mobile or web only supports int4(default)") elif args.target == "cpu" and args.precision == "fp16": raise ValueError("Choose from fp32 or int4(default) for cpu target")
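
Note for readers skimming the last two fixes: the sketch below (plain Python, not part of any patch) illustrates the argument-validation flow that examples/phi3/phi3.py converges on once --target is required, int4 is the default precision, and the dangling elif is turned back into an if. The TARGETS/TARGET_TO_EP values and the two ValueError rules are taken from the hunks above; the helper name parse_and_validate and the standalone __main__ harness are illustrative assumptions, not code from the repository.

    # Minimal sketch, assuming only what the hunks above show.
    import argparse

    TARGETS = ["cpu", "cuda", "mobile", "web"]

    TARGET_TO_EP = {
        "cpu": "CPUExecutionProvider",
        "mobile": "CPUExecutionProvider",
        "cuda": "CUDAExecutionProvider",
        "web": "JsExecutionProvider",
    }

    def parse_and_validate(raw_args=None):
        parser = argparse.ArgumentParser()
        # --target is required and restricted to TARGETS, so main() no longer
        # needs the "if not args.target" guard that the patches remove.
        parser.add_argument("--target", type=str, required=True, choices=TARGETS)
        # int4 is now the argparse default, so the manual fallback is gone too.
        parser.add_argument(
            "--precision", type=str, default="int4", choices=["fp32", "fp16", "int4"]
        )
        args = parser.parse_args(raw_args)

        # With the guard removed, the first branch must be "if", not "elif";
        # an elif with no preceding if is a syntax error.
        if args.target in ("mobile", "web") and args.precision != "int4":
            raise ValueError("mobile or web only supports int4(default)")
        elif args.target == "cpu" and args.precision == "fp16":
            raise ValueError("Choose from fp32 or int4(default) for cpu target")
        return args, TARGET_TO_EP[args.target]

    if __name__ == "__main__":
        # e.g. python sketch.py --target cuda --precision fp16
        print(parse_and_validate())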